/usr/lib/python3/dist-packages/geopy/geocoders/wiki_semantic.py is in python3-geopy 0.95.1-1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
import xml.dom.minidom
from urllib.request import urlopen

from geopy.geocoders.base import Geocoder
from geopy.point import Point
from geopy.location import Location
from geopy import util

# BeautifulSoup is an optional dependency: without it this geocoder cannot
# parse wiki pages, so only warn at import time instead of failing.
try:
    from BeautifulSoup import BeautifulSoup
except ImportError:
    util.logger.warn("BeautifulSoup was not found. "
                     "The SemanticMediaWiki geocoder will not work.")

# Ancient-Python compatibility shim: use the built-in ``set`` when present,
# otherwise fall back to the pre-2.4 ``sets`` module.
try:
    set
except NameError:
    from sets import Set as set
class SemanticMediaWiki(Geocoder):
    """Geocoder backed by a Semantic MediaWiki instance.

    Looks up a wiki page, follows its RDF-export ``<link>`` from the page
    head, and extracts coordinates from semantic attributes — or, failing
    that, recursively from pages reachable through semantic relations.
    """

    def __init__(self, format_url, attributes=None, relations=None,
                 prefer_semantic=False, transform_string=None):
        """
        :param format_url: URL template with one ``%s`` placeholder for the
            (possibly transformed) query string.
        :param attributes: iterable of semantic attribute names whose values
            may contain coordinates.
        :param relations: iterable of semantic relation names to follow when
            no attribute yields coordinates.
        :param prefer_semantic: stored for callers; not consulted by the
            methods in this module.
        :param transform_string: optional callable applied to the query
            string before it is substituted into ``format_url``.
        """
        self.format_url = format_url
        self.attributes = attributes
        self.relations = relations
        self.prefer_semantic = prefer_semantic
        self.transform_string = transform_string

    def get_url(self, string):
        """Return the wiki page URL for ``string``."""
        # Bug fix: ``transform_string`` defaults to None, but the original
        # called it unconditionally (TypeError). Apply it only when provided.
        if self.transform_string is not None:
            string = self.transform_string(string)
        return self.format_url % string

    def parse_rdf_link(self, page, mime_type='application/rdf+xml'):
        """Parse the URL of the RDF link from the <head> of ``page``."""
        soup = BeautifulSoup(page)
        link = soup.head.find('link', rel='alternate', type=mime_type)
        return link['href'] if link else None

    def parse_rdf_things(self, data):
        """Parse an RDF export document.

        Returns ``(thing_map, primary_thing)``: ``thing_map`` maps each
        ``smw:Thing`` URI to its ``smw:hasArticle`` URL, and
        ``primary_thing`` is the first thing in document order (``None``
        for a document with no things).
        """
        dom = xml.dom.minidom.parseString(data)
        thing_map = {}
        things = dom.getElementsByTagName('smw:Thing')
        # Reversed so that, after the loop, ``thing`` holds the document's
        # first (primary) smw:Thing.
        things.reverse()
        thing = None  # bug fix: avoid NameError on an empty document
        for thing in things:
            name = thing.attributes['rdf:about'].value
            articles = thing.getElementsByTagName('smw:hasArticle')
            if articles:
                # Bug fix: the original indexed the ``things`` NodeList with
                # a string key and returned it instead of the populated dict.
                thing_map[name] = articles[0].attributes['rdf:resource'].value
        return (thing_map, thing)

    def transform_semantic(self, string):
        """Normalize semantic attribute and relation names by replacing spaces
        with underscores and capitalizing the result."""
        return string.replace(' ', '_').capitalize()

    def get_relations(self, thing, relations=None):
        """Yield ``(relation, resource_uri)`` pairs found on ``thing``."""
        if relations is None:
            relations = self.relations
        # ``or ()`` guards the common case of the constructor default (None).
        for relation in relations or ():
            relation = self.transform_semantic(relation)
            for node in thing.getElementsByTagName('relation:' + relation):
                resource = node.attributes['rdf:resource'].value
                yield (relation, resource)

    def get_attributes(self, thing, attributes=None):
        """Yield ``(attribute, text_value)`` pairs found on ``thing``."""
        if attributes is None:
            attributes = self.attributes
        for attribute in attributes or ():
            attribute = self.transform_semantic(attribute)
            for node in thing.getElementsByTagName('attribute:' + attribute):
                value = node.firstChild.nodeValue.strip()
                yield (attribute, value)

    def get_thing_label(self, thing):
        """Return the rdfs:label text of ``thing``."""
        return util.get_first_text(thing, 'rdfs:label')

    def geocode_url(self, url, attempted=None):
        """Geocode the wiki page at ``url``.

        Returns ``(name, (latitude, longitude))``; any element may be None
        when nothing could be resolved. ``attempted`` accumulates URLs
        already visited so cyclic semantic relations cannot recurse forever.
        """
        if attempted is None:
            attempted = set()
        util.logger.debug("Fetching %s..." % url)
        page = urlopen(url)
        # ``parse_rdf_link`` builds its own soup, so hand it the raw page;
        # the original souped the page here and then re-parsed the soup.
        rdf_url = self.parse_rdf_link(page)
        util.logger.debug("Fetching %s..." % rdf_url)
        page = urlopen(rdf_url)
        # Bug fix: the original called the nonexistent ``parse_rdf`` and
        # ``get_label``. minidom wants the document text, not a file object,
        # hence ``page.read()``.
        things, thing = self.parse_rdf_things(page.read())
        name = self.get_thing_label(thing)
        # Bug fix: initialize so an empty attribute scan cannot raise
        # NameError in the checks below.
        latitude = longitude = None
        for attribute, value in self.get_attributes(thing):
            latitude, longitude = util.parse_geo(value)
            if None not in (latitude, longitude):
                break
        if None in (latitude, longitude):
            # No coordinates on this page itself: follow semantic relations.
            for relation, resource in self.get_relations(thing):
                url = things.get(resource, resource)
                if url in attempted:  # Avoid cyclic relationships.
                    continue
                attempted.add(url)
                name, (latitude, longitude) = self.geocode_url(url, attempted)
                if None not in (name, latitude, longitude):
                    break
        return (name, (latitude, longitude))