"""
Computing geo-coordinates for homelands of language groups, i.e. languoids of level `family`.
Various ways of computing "homelands" for language groups have been proposed in the literature
since Sapir 1916. This module provides implementations of some of the simpler algorithms.
"""
import gzip
import json
import random
import typing
import decimal
import pathlib
import collections
try:
from shapely.geometry import shape, GeometryCollection, MultiPoint, Point
from shapely.ops import nearest_points
import pyproj
geo = True
except ImportError: # pragma: no cover
geo = False
import pyglottolog
from pyglottolog.languoids import Languoid
# Names exported by `from <this module> import *`.
__all__ = ['compute', 'md', 'recursive_centroids']
# Seed the global random number generator at import time, so that the shuffling done in `md`
# starts from a fixed state.
# NOTE(review): this re-seeds the process-wide `random` module for every importer - confirm
# that this side effect is intended.
random.seed(12345)
def _worlds_land_masses_dict():
    """
    Load the land-mass geometries shipped in the `data` directory of the `pyglottolog` package.

    Each `*.geojson.gz` file is read as gzipped GeoJSON. The geometry of every feature is
    converted to a shapely shape and passed through `buffer(0)` (presumably to repair invalid
    geometries) before being added to the collection for the file.

    :return: `dict` mapping the file name without its `.geojson.gz` extension (e.g. `"africa"`
        for `africa.geojson.gz`) to a `GeometryCollection` of the geometries in that file.
    """
    suffix = '.geojson.gz'
    data_dir = pathlib.Path(pyglottolog.__file__).parent.joinpath('data')
    res = {}
    # Iterate in sorted order, so that the order of the returned dict does not depend on the
    # order in which the file system lists the directory.
    for p in sorted(data_dir.glob('*' + suffix)):
        with gzip.open(p, mode='rt', encoding='utf8') as fp:
            features = json.load(fp)['features']
        # `Path.stem` only strips the last suffix, i.e. it would yield "<name>.geojson". We want
        # the bare name as key, thus we cut off the complete double extension.
        res[p.name[:-len(suffix)]] = GeometryCollection(
            [shape(f['geometry']).buffer(0) for f in features])
    return res
def compute(api: pyglottolog.Glottolog,
            method: typing.Callable[
                [typing.List[Languoid]],
                typing.Dict[str, typing.Tuple[decimal.Decimal, decimal.Decimal]]])\
        -> typing.Dict[str, typing.Tuple[decimal.Decimal, decimal.Decimal]]:
    """
    Compute homelands for applicable Glottolog subgroups using a method implemented in this module
    or any callable with appropriate signature.

    :param api: Glottolog API object; the languages passed to `method` are selected from it via \
    `_l1_languages_with_coordinates`.
    :param method: Callable accepting the list of selected languages and returning the homelands.
    :return: Whatever `method` returns, i.e. a `dict` mapping group identifiers to coordinate pairs.
    :raises ValueError: If the optional geo packages (shapely, pyproj) could not be imported.
    """
    if not geo:  # pragma: no cover
        raise ValueError('Computing homelands requires the "geo" extra, installed via '
                         '"pip install pyglottolog[geo]"')
    return method(_l1_languages_with_coordinates(api))
def geodist(geod, p1, p2):
    """
    Distance between two points, as computed by `geod.inv`.

    :param geod: Object with an `inv(lon1, lat1, lon2, lat2)` method, e.g. a `pyproj.Geod`.
    :param p1: First point; index 0 is passed as latitude, index 1 as longitude.
    :param p2: Second point, same layout as `p1`.
    :return: The third item of the result of `geod.inv` (for `pyproj.Geod` this is the distance \
    between the points).
    """
    lat1, lon1 = p1[0], p1[1]
    lat2, lon2 = p2[0], p2[1]
    # `inv` expects longitude before latitude, while our points are stored latitude first.
    inverse = geod.inv(lon1, lat1, lon2, lat2)
    return inverse[2]
def md(langs: typing.List[Languoid])\
        -> typing.Dict[str, typing.Tuple[decimal.Decimal, decimal.Decimal]]:
    """
    Compute homeland coordinates for a language group (and its subgroups) as described as
    "md" method in "Testing methods of linguistic homeland detection using synthetic data"
    by Søren Wichmann and Taraka Rama
    https://doi.org/10.1098/rstb.2020.0202

    Wichmann and Rama 2021:

        In the third approach, abbreviated ‘md’ for ‘minimal distance’, we compute the average
        distance (as the crow flies) from each language to all the other languages. The location
        of the language that has the smallest average distance to the others is equated with the
        homeland.

    We use the `pyproj.Geod.inv` method to compute the great-circle distance between two points.

    .. seealso:: https://pyproj4.github.io/pyproj/stable/api/geod.html

    :param langs: Languages with coordinates; each language is counted for every group listed \
    in its lineage.
    :return: `dict` mapping the Glottocode of each group to the `(latitude, longitude)` pair of \
    the member language chosen as homeland.
    """
    geod = pyproj.Geod(ellps='WGS84')

    # Register each language with all groups of its lineage.
    grouped_languages = collections.defaultdict(list)
    for lang in langs:
        for _, gc, _ in lang.lineage:
            grouped_languages[gc].append(lang)

    # Compute minimal distances per group:
    homelands = {}
    for group, lgs in grouped_languages.items():
        if len(lgs) == 1:  # pragma: no cover
            # Nothing to compare: the only language is the homeland.
            homelands[group] = (lgs[0].latitude, lgs[0].longitude)
            continue

        # We shuffle the coordinates to avoid returning the first minimal-distance location in the
        # given order.
        coords = [(lg.latitude, lg.longitude) for lg in lgs]
        random.shuffle(coords)

        mindist, mincoord = None, None
        for i, coord in enumerate(coords):
            # Every candidate is compared to the same number of other languages, thus ranking by
            # the summed distance is equivalent to ranking by the average distance.
            dist = sum(geodist(geod, coord, p) for j, p in enumerate(coords) if i != j)
            if (mindist is None) or (dist < mindist):
                mindist, mincoord = dist, coord
        homelands[group] = mincoord
    return homelands
def recursive_centroids(langs: typing.List[Languoid])\
        -> typing.Dict[str, typing.Tuple[decimal.Decimal, decimal.Decimal]]:
    """
    Recursively compute homelands of subgroups from the homelands of their immediate children in
    the classification.

    1. The homeland of a single language is its geographic coordinate.
    2. The homeland of a set of coordinates (for homelands or languages) is computed as
       nearest point on land of the centroid of the convex hull for the set of coordinates.

    :param langs: Languages with coordinates, lineage and at least one macroarea. The macroarea \
    of the first language determines which land masses are checked first.
    :return: `dict` mapping the Glottocode of each group to a `(latitude, longitude)` pair, taken \
    from the `y` and `x` attributes of a shapely `Point`.
    """
    # We compute centroids with shapely in a projection-agnostic way. Thus, we have to make sure
    # to deal with longitudes wrapping around at 180° - which only happens for subgroups of
    # Austronesian.
    def pos_lon(lon, tlgc):
        return lon + 360 if lon < 0 and tlgc == 'aust1307' else lon

    def norm_lon(lon, tlgc):
        return lon - 360 if lon > 180 and tlgc == 'aust1307' else lon

    # Names of the land masses to check first, per macroarea name.
    continents_by_macroarea = {
        'South America': ['southamerica', 'northamerica'],
        'North America': ['northamerica', 'southamerica'],
        'Eurasia': ['asia', 'europe'],
        'Africa': ['africa'],
        'Papunesia': ['oceania'],
        'Australia': ['oceania'],
    }

    subgroups = collections.defaultdict(list)
    pref_continents = None
    for lang in langs:
        if pref_continents is None:
            pref_continents = continents_by_macroarea[lang.macroareas[0].name]
        tlgc = lang.lineage[0][1]
        prev = None
        for i, (_, gc, _) in enumerate(reversed(lang.lineage)):
            if i == 0:  # For the immediate parent, we append the coordinate.
                subgroups[(tlgc, gc)].append((lang.latitude, lang.longitude))
            else:  # Otherwise we append the Glottocode of the immediate child.
                subgroups[(tlgc, gc)].append(prev)
            prev = gc

    geod = pyproj.Geod(ellps='WGS84')
    continents = _worlds_land_masses_dict()
    # Order the land masses once, preferred ones first. The names may still carry a ".geojson"
    # suffix (when derived via `Path.stem` from "<name>.geojson.gz"), thus we only compare the
    # part before the first dot. The order only determines which land mass is tested first.
    preferred = pref_continents or []
    ordered_land_masses = [
        land_mass for _, land_mass in sorted(
            continents.items(),
            key=lambda item: item[0].split('.')[0] in preferred,
            reverse=True)]

    homelands = {}
    while subgroups:
        for (tlgc, gc), coords in list(subgroups.items()):
            # coords is a list of immediate children of the group specified by `gc`, either
            # given as coordinates of languages or homelands or as glottocodes.
            coords = [homelands.get(v, v) for v in coords]
            if any(isinstance(v, str) for v in coords):
                # There are still unresolved homelands for the immediate children. Defer
                # computation until all homelands of children are resolved.
                continue  # pragma: no cover

            # Compute the homeland from the homelands of the children.
            homeland = MultiPoint(
                [(pos_lon(p[1], tlgc), p[0]) for p in coords]).convex_hull.centroid
            homeland = Point(norm_lon(homeland.x, tlgc), homeland.y)

            if not any(land_mass.contains(homeland) for land_mass in ordered_land_masses):
                # The centroid is not on land: Move it to the nearest point on any land mass.
                nps = [nearest_points(c, homeland)[0] for c in continents.values()]
                homeland = min(
                    nps, key=lambda p: geodist(geod, (p.y, p.x), (homeland.y, homeland.x)))

            homelands[gc] = (homeland.y, homeland.x)
            del subgroups[(tlgc, gc)]
    return homelands
def _l1_languages_with_coordinates(api):
    """
    Select the languoids from `api.languoids()` which can be used to compute homelands.

    A languoid is selected if it has a latitude, a non-empty lineage, is of level language and
    of the spoken L1 category. For three top-level groups, languoids located in certain macroareas
    are excluded in addition.

    :param api: Glottolog API object.
    :return: `list` of the selected languoids, in the order yielded by `api.languoids()`.
    """
    # Macroareas to be ignored, keyed by Glottocode of the top-level group.
    invalid_macroareas = {
        'atla1278': {api.macroareas.northamerica.name},
        'aust1307': {api.macroareas.southamerica.name},
        'indo1319': {
            api.macroareas.northamerica.name,
            api.macroareas.southamerica.name,
            api.macroareas.africa.name,
            api.macroareas.australia.name,
            api.macroareas.pacific.name,
        },
    }

    selected = []
    for lg in api.languoids():
        if lg.latitude is None or not lg.lineage:
            continue
        if not (lg.level == api.languoid_levels.language):
            continue
        if not (lg.category == api.language_types.spoken_l1.category):
            continue
        top_level = lg.lineage[0][1]
        if top_level in invalid_macroareas:
            excluded = invalid_macroareas[top_level]
            if not excluded.isdisjoint(ma.name for ma in lg.macroareas):
                # Located in a macroarea which is ignored for this top-level group.
                continue
        selected.append(lg)
    return selected
if __name__ == '__main__':  # pragma: no cover
    # Ad-hoc check: compute "md" homelands for the Glottolog data at the path given as first
    # command line argument and print the number of homelands.
    import sys
    from pyglottolog import Glottolog

    repos = sys.argv[1]
    homelands = compute(Glottolog(repos, cache=True), md)
    print(len(homelands))