[fix] merge infoboxes based on weight
also minor changes in attributes and images from wikidata
author marc
parent c2e4014287
commit ad58b14be7
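The idea of the change, in brief: each engine may declare an optional weight (implicitly 1), and when two infoboxes for the same entity are merged, the higher-weight engine wins conflicting fields such as the image source. Below is a minimal runnable sketch of that behavior, not the code from this commit; the engine registry and image names are made up for illustration.

from types import SimpleNamespace

# Hypothetical registry: wikidata declares weight 2, duckduckgo declares none.
engines = {
    'wikidata': SimpleNamespace(weight=2),
    'duckduckgo': SimpleNamespace(),
}

def get_weight(engine_name):
    # Engines without an explicit weight fall back to 1.
    return getattr(engines[engine_name], 'weight', 1)

def merge_two_infoboxes(infobox1, infobox2):
    weight1 = get_weight(infobox1['engine'])
    weight2 = get_weight(infobox2['engine'])
    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']
    # The heavier engine's image replaces an existing one; otherwise
    # infobox2's image only fills a missing img_src.
    img2 = infobox2.get('img_src')
    if img2 is not None and (infobox1.get('img_src') is None or weight2 > weight1):
        infobox1['img_src'] = img2
    return infobox1

box = merge_two_infoboxes(
    {'engine': 'duckduckgo', 'img_src': 'ddg.png'},
    {'engine': 'wikidata', 'img_src': 'wd.png'},
)
assert box == {'engine': 'wikidata', 'img_src': 'wd.png'}

The diff below spells the fallback out with hasattr/else instead of getattr's default argument; the two are equivalent here.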
@@ -35,7 +35,7 @@ url_detail = wikidata_api\
 
 url_map = 'https://www.openstreetmap.org/'\
     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
-url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500'
+url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'
 
 # xpaths
 wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
@@ -162,6 +162,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
 
     # INFOBOX ATTRIBUTES (ROWS)
 
+    # DATES
     # inception date
     add_attribute(attributes, result, 'P571', date=True)
     # dissolution date
@@ -170,11 +171,14 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     add_attribute(attributes, result, 'P580', date=True)
     # end date
     add_attribute(attributes, result, 'P582', date=True)
-
     # date of birth
     add_attribute(attributes, result, 'P569', date=True)
     # date of death
    add_attribute(attributes, result, 'P570', date=True)
+    # date of spacecraft launch
+    add_attribute(attributes, result, 'P619', date=True)
+    # date of spacecraft landing
+    add_attribute(attributes, result, 'P620', date=True)
 
     # nationality
     add_attribute(attributes, result, 'P27')
@@ -201,7 +205,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     # area
     add_attribute(attributes, result, 'P2046')
     # currency
-    add_attribute(attributes, result, 'P38')
+    add_attribute(attributes, result, 'P38', trim=True)
     # heigth (building)
     add_attribute(attributes, result, 'P2048')
 
@@ -230,6 +234,10 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     add_attribute(attributes, result, 'P264')
     # publisher
     add_attribute(attributes, result, 'P123')
+    # original network
+    add_attribute(attributes, result, 'P449')
+    # distributor
+    add_attribute(attributes, result, 'P750')
     # composer
     add_attribute(attributes, result, 'P86')
     # publication date
@@ -266,6 +274,10 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     add_attribute(attributes, result, 'P112')
     # legal form (company/organization)
     add_attribute(attributes, result, 'P1454')
+    # operator
+    add_attribute(attributes, result, 'P137')
+    # crew members (tripulation)
+    add_attribute(attributes, result, 'P1029')
     # taxon
     add_attribute(attributes, result, 'P225')
     # chemical formula
@@ -300,8 +312,8 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
 
 # only returns first match
 def add_image(result):
-    # P18: image, P154: logo, P242: map, P41: flag, P2716: collage, P2910: icon
-    property_ids = ['P18', 'P154', 'P242', 'P41', 'P2716', 'P2910']
+    # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon
+    property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910']
 
     for property_id in property_ids:
         image = result.xpath(property_xpath.replace('{propertyid}', property_id))
@@ -320,6 +332,7 @@ def add_attribute(attributes, result, property_id, default_label=None, date=Fals
             label = default_label
         else:
             label = extract_text(attribute[0].xpath(label_xpath))
+            label = label[0].upper() + label[1:]
 
         if date:
             trim = True
@@ -369,6 +382,7 @@ def add_url(urls, result, property_id=None, default_label=None, url_prefix=None,
             dom_element = dom_element[0]
             if not default_label:
                 label = extract_text(dom_element.xpath(label_xpath))
+                label = label[0].upper() + label[1:]
 
             if link_type == 'geo':
                 links.append(get_geolink(dom_element))
|  | ||||
@@ -43,6 +43,19 @@ def compare_urls(url_a, url_b):
 
 
 def merge_two_infoboxes(infobox1, infobox2):
+    # get engines weights
+    if hasattr(engines[infobox1['engine']], 'weight'):
+        weight1 = engines[infobox1['engine']].weight
+    else:
+        weight1 = 1
+    if hasattr(engines[infobox2['engine']], 'weight'):
+        weight2 = engines[infobox2['engine']].weight
+    else:
+        weight2 = 1
+
+    if weight2 > weight1:
+        infobox1['engine'] = infobox2['engine']
+
     if 'urls' in infobox2:
         urls1 = infobox1.get('urls', None)
         if urls1 is None:
@@ -64,6 +77,8 @@ def merge_two_infoboxes(infobox1, infobox2):
         img2 = infobox2.get('img_src')
         if img1 is None:
             infobox1['img_src'] = img2
+        elif weight2 > weight1:
+            infobox1['img_src'] = img2
 
     if 'attributes' in infobox2:
         attributes1 = infobox1.get('attributes', None)
@@ -77,6 +92,7 @@ def merge_two_infoboxes(infobox1, infobox2):
                 attributeSet.add(attribute.get('label', None))
+
         for attribute in infobox2.get('attributes', []):
             if attribute.get('label', None) not in attributeSet:
                 attributes1.append(attribute)
 
     if 'content' in infobox2:
|  | ||||
@@ -105,6 +105,7 @@ engines:
   - name : ddg definitions
     engine : duckduckgo_definitions
     shortcut : ddd
+    weight : 2
     disabled : True
 
   - name : digg
@@ -127,6 +128,7 @@ engines:
   - name : wikidata
     engine : wikidata
     shortcut : wd
+    weight : 2
 
   - name : duckduckgo
     engine : duckduckgo
|  | ||||
@@ -95,14 +95,14 @@ class TestWikidataEngine(SearxTestCase):
 
         results = wikidata.getDetail(response, "Q123", "yua", "yua_MX")
         self.assertEqual(len(results), 2)
-        self.assertEqual(results[0]['title'], 'official website')
+        self.assertEqual(results[0]['title'], 'Official website')
         self.assertEqual(results[0]['url'], 'https://officialsite.com')
 
         self.assertEqual(results[1]['infobox'], 'Test')
         self.assertEqual(results[1]['id'], None)
         self.assertEqual(results[1]['content'], 'Description')
         self.assertEqual(results[1]['attributes'], [])
-        self.assertEqual(results[1]['urls'][0]['title'], 'official website')
+        self.assertEqual(results[1]['urls'][0]['title'], 'Official website')
         self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com')
         self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)')
         self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test')
@@ -141,7 +141,8 @@ class TestWikidataEngine(SearxTestCase):
         html_etree = fromstring(html)
 
         image_src = wikidata.add_image(html_etree)
-        self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500")
+        self.assertEqual(image_src,
+                         "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400")
 
         html = u"""
         <div>
@@ -196,7 +197,8 @@ class TestWikidataEngine(SearxTestCase):
         html_etree = fromstring(html)
 
         image_src = wikidata.add_image(html_etree)
-        self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500")
+        self.assertEqual(image_src,
+                         "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400")
 
     def test_add_attribute(self):
         html = u"""
@@ -234,7 +236,7 @@ class TestWikidataEngine(SearxTestCase):
 
         wikidata.add_attribute(attributes, html_etree, "P27")
         self.assertEqual(len(attributes), 1)
-        self.assertEqual(attributes[0]["label"], "country of citizenship")
+        self.assertEqual(attributes[0]["label"], "Country of citizenship")
         self.assertEqual(attributes[0]["value"], "United Kingdom")
 
         html = u"""
@@ -269,7 +271,7 @@ class TestWikidataEngine(SearxTestCase):
         html_etree = fromstring(html)
         wikidata.add_attribute(attributes, html_etree, "P569", date=True)
         self.assertEqual(len(attributes), 1)
-        self.assertEqual(attributes[0]["label"], "date of birth")
+        self.assertEqual(attributes[0]["label"], "Date of birth")
         self.assertEqual(attributes[0]["value"], "27 January 1832")
 
         html = u"""
@@ -317,7 +319,7 @@ class TestWikidataEngine(SearxTestCase):
         html_etree = fromstring(html)
         wikidata.add_attribute(attributes, html_etree, "P6")
         self.assertEqual(len(attributes), 1)
-        self.assertEqual(attributes[0]["label"], "head of government")
+        self.assertEqual(attributes[0]["label"], "Head of government")
         self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister")
 
         attributes = []
@@ -355,7 +357,7 @@ class TestWikidataEngine(SearxTestCase):
         html_etree = fromstring(html)
         wikidata.add_url(urls, html_etree, 'P856')
         self.assertEquals(len(urls), 1)
-        self.assertIn({'title': 'official website', 'url': 'https://searx.me/'}, urls)
+        self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls)
         urls = []
         results = []
         wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results)
@@ -403,8 +405,8 @@ class TestWikidataEngine(SearxTestCase):
         html_etree = fromstring(html)
         wikidata.add_url(urls, html_etree, 'P856')
         self.assertEquals(len(urls), 2)
-        self.assertIn({'title': 'official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
-        self.assertIn({'title': 'official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)
+        self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
+        self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)
 
     def test_get_imdblink(self):
         html = u"""
|  | ||||