"""
An OO wrapper for languoid data access and manipulation.
"""
import os
import re
from typing import Union, Optional, TYPE_CHECKING, Any
import pathlib
import datetime
import warnings
import functools
import configparser
from collections.abc import Sequence, Generator, Iterable
from clldutils.inifile import INI
from newick import Node
from pyglottolog import config
from .models import (
Glottocode, Country, Reference, Endangerment, Link,
ClassificationComment, EthnologueComment, ISORetirement,
)
if TYPE_CHECKING: # pragma: no cover
from pyglottolog import Glottolog
__all__ = ['Languoid']
INFO_FILENAME = 'md.ini'
ISO_8601_INTERVAL = re.compile(
r'(?P<start_sign>[+-]?)'
r'(?P<start_date>\d{1,4}-\d{2}-\d{2})'
r'/'
r'(?P<end_sign>[+-]?)'
r'(?P<end_date>\d{1,4}-\d{2}-\d{2})',
flags=re.ASCII)
NameIdLevelType = tuple[str, Glottocode, config.LanguoidLevel]
[docs]@functools.total_ordering
class Languoid: # pylint: disable=too-many-public-methods
"""
Info on languoids is encoded in the INI files and in the directory hierarchy of
:attr:`pyglottolog.Glottolog.tree`.
This class provides access to all of it.
**Languoid formatting**:
:ivar _format_specs: A `dict` mapping custom format specifiers to conversion functions. Usage:
.. code-block:: python
>>> l = Languoid.from_name_id_level(pathlib.Path('.'), 'N(a,m)e', 'abcd1234', 'language')
>>> '{0:newick_name}'.format(l)
'N{a/m}e'
.. seealso::
`<https://www.python.org/dev/peps/pep-3101/#format-specifiers>`_ and
`<https://www.python.org/dev/peps/pep-3101/#controlling-formatting-on-a-per-type-basis>`_
"""
section_core = 'core'
def __init__( # pylint: disable=R0913,R0917
self,
cfg: INI,
lineage: Optional[list[tuple[str, str, str]]] = None,
id_: Optional[str] = None,
directory: Optional[pathlib.Path] = None,
tree: Optional[pathlib.Path] = None,
_api=None):
"""
Refer to the factory methods for typical use cases of instantiating a `Languoid`:
- :meth:`Languoid.from_dir`
- :meth:`Languoid.from_id_name_level`
:param cfg: `INI` instance storing the languoid's metadata.
:param lineage: list of ancestors (from root to this languoid).
:param id_: Glottocode for the languoid (or `None`, if `directory` is passed).
:param _api: Some properties require access to config data which is accessed through a \
`Glottolog` API instance.
"""
assert (id_ and tree) or directory
if id_ is None:
id_ = Glottocode(directory.name)
lineage = lineage or []
assert all(Glottocode.pattern.match(id) for _, id, _ in lineage)
self.lineage = [
(name, id, _api.languoid_levels.get(level) if _api else level)
for name, id, level in lineage]
self.cfg: INI = cfg
self.dir: pathlib.Path = directory or tree.joinpath(*[id for name, id, _ in self.lineage])
self._id: Glottocode = id_
self._api = _api
[docs] @classmethod
def from_dir(
cls,
directory: pathlib.Path,
nodes: Optional[dict[str, NameIdLevelType]] = None,
_api=None,
**kw):
"""
Create a `Languoid` from a directory, named with the Glottocode and containing `md.ini`.
This method is used by :class:`pyglottolog.Glottolog` to read `Languoid`s from the
repository's `languoids/tree` directory.
"""
if _api and _api.cache and directory.name in _api.cache:
return _api.cache[directory.name]
cfg = INI.from_file(directory.joinpath(INFO_FILENAME), interpolation=None)
if nodes is None:
nodes: dict[str, NameIdLevelType] = {}
lineage = []
for parent in directory.parents:
id_ = parent.name
assert id_ != directory.name
if not Glottocode.pattern.match(id_):
# we ignore leading non-languoid-dir path components.
break
if id_ not in nodes:
l_ = Languoid.from_dir(parent, nodes=nodes, _api=_api, **kw)
nodes[id_] = (l_.name, l_.id, l_.level)
lineage.append(nodes[id_])
res = cls(cfg, list(reversed(lineage)), directory=directory, _api=_api, **kw)
nodes[res.id] = (res.name, res.id, res.level)
return res
[docs] @classmethod
def from_name_id_level(cls, tree, name, id_, level, **kw):
"""
This method is used in `pyglottolog.lff` to instantiate `Languoid` s for new nodes
encountered in "lff"-format trees.
"""
cfg = INI(interpolation=None)
cfg.read_dict(dict(core=dict(name=name))) # pylint: disable=R1735
res = cls(cfg, kw.pop('lineage', []), id_=Glottocode(id_), tree=tree)
for k, v in kw.items():
setattr(res, k, v)
# Note: Setting the level behaves differently when `_api` is available, so must be done
# after all other attributes are initialized.
res.level = level
return res
# We provide a couple of node label format specifications which can be used when serializing
# trees in newick format.
_format_specs = {
'newick_name': (
lambda l_: l_.name.replace(
',', '/').replace('(', '{').replace(')', '}').replace("'", "''"),
"Languoid name with special newick characters replaced"),
'newick_level': (
lambda l_: '-l-' if getattr(l_.level, 'id', l_.level) == 'language' else '',
"Languoid level in case of languages"),
'newick_iso': (
lambda l_: f'[{l_.iso}]' if l_.iso else '',
"Bracketed ISO code or nothing"),
}
_newick_default_template = "'{l:newick_name} [{l.id}]{l:newick_iso}{l:newick_level}'"
def __format__(self, format_spec):
if format_spec in self._format_specs:
return self._format_specs[format_spec][0](self)
return object.__format__(self, format_spec)
def __hash__(self):
return hash(self.id)
def __eq__(self, other):
return self.id == other.id
def __lt__(self, other):
"""
To allow Languoid lists to be sorted, we implement a simple ordering by Glottocode.
"""
return self.id < other.id
def __repr__(self):
return f"<{getattr(self.level, 'name', self.level).capitalize()} {self.id}>"
def __str__(self):
return f'{self.name} [{self.id}]'
def _set(self, key, value, section=None):
section = section or self.section_core
if value is None and key in self.cfg[section]:
del self.cfg[section][key]
else:
self.cfg.set(section, key, value)
def _get(self, key, type_=None):
res = self.cfg.get(self.section_core, key, fallback=None)
if type_ and res:
return type_(res)
return res
[docs] def newick_node(
self,
nodes: Optional['LanguoidMapType'] = None,
template=None,
maxlevel=None,
level=0,
) -> Node:
"""
Return a `newick.Node` representing the subtree of the Glottolog classification starting
at the languoid.
:param template: Python format string accepting the `Languoid` instance as single \
variable named `l`, used to format node labels.
"""
template = template or self._newick_default_template
n = Node(name=template.format(l=self), length='1') # noqa: E741
children = self.children if nodes is None else self.children_from_nodemap(nodes)
for nn in sorted(children, key=lambda nn: nn.name):
if maxlevel:
if (isinstance(maxlevel, config.LanguoidLevel) and nn.level > maxlevel) or \
(not isinstance(maxlevel, config.LanguoidLevel) and level > maxlevel):
continue
n.add_descendant(
nn.newick_node(nodes=nodes, template=template, maxlevel=maxlevel, level=level + 1))
return n
[docs] def write_info(self, outdir: Optional[pathlib.Path] = None) -> pathlib.Path:
"""
Write `Languoid` metadata as INI file to `outdir/<INFO_FILENAME>`.
"""
outdir = outdir or self.dir
if not isinstance(outdir, pathlib.Path):
outdir = pathlib.Path(outdir)
if outdir.name != self.id:
outdir = outdir.joinpath(self.id)
if not outdir.exists():
outdir.mkdir()
fname = outdir.joinpath(INFO_FILENAME)
self.cfg.write(fname)
if os.linesep == '\n':
with fname.open(encoding='utf8') as fp:
text = fp.read()
with fname.open('w', encoding='utf8') as fp:
fp.write(text.replace('\n', '\r\n'))
return fname
# -------------------------------------------------------------------------
# Accessing info of a languoid
# -------------------------------------------------------------------------
@property
def glottocode(self) -> Glottocode: # pylint: disable=C0116
"""Alias for `id`"""
return self._id
@property
def id(self) -> Glottocode: # pylint: disable=C0116
return self._id
@property
def category(self) -> Optional[str]:
"""
Languoid category.
- Category name from :class:`pyglottolog.config.LanguageType` for languoids of level \
"language",
- `"Family"` or `"Pseudo Family"` for families,
- `"Dialect"` for dialects.
"""
# Computing the category requires access to config data:
if self._api:
pseudo_families = {
c.pseudo_family_id: c.category for c in self._api.language_types.values()}
fid = self.lineage[0][1] if self.lineage else None
if self.level == self._api.languoid_levels.language:
return pseudo_families.get(fid, self._api.language_types['spoken_l1'].category)
cat = self.level.name.capitalize()
if self.level == self._api.languoid_levels.family:
if self.id.startswith('unun9') or \
self.id in pseudo_families or fid in pseudo_families:
cat = 'Pseudo ' + cat
return cat
return None # pragma: no cover
@property
def isolate(self) -> bool:
"""
Flag signaling whether the languoid is an isolate, i.e. has level "language" and is not
member of a family.
"""
return getattr(self.level, 'id', self.level) == 'language' and not self.lineage
[docs] def children_from_nodemap(self, nodes: 'LanguoidMapType') -> list['Languoid']:
"""
A faster alternative to `children` when the relevant languoids have already been
read from disc.
"""
return [nodes[d.name] for d in self.dir.iterdir() if d.is_dir()]
[docs] def descendants_from_nodemap(self, nodes: 'LanguoidMapType', level=None) -> list['Languoid']:
"""
A faster alternative to `descendants` when the relevant languoids have already
been read from disc.
"""
if isinstance(level, str):
level = self._api.languoid_levels.get(level)
return [
n for n in nodes.values() if
n.lineage and self.id in [li[1] for li in n.lineage] and # noqa: W504
((level is None) or n.level == level)]
@property
def children(self) -> list['Languoid']:
"""
List of direct descendants of the languoid in the classification tree.
.. note::
Using this on many languoids can be slow, because the directory tree may be traversed
and INI files read multiple times. To circumvent this problem, you may use a read-only
:class:`pyglottolog.Glottolog` instance, by passing `cache=True` at initialization.
"""
return [Languoid.from_dir(d, _api=self._api) for d in self.dir.iterdir() if d.is_dir()]
[docs] def ancestors_from_nodemap(self, nodes: 'LanguoidMapType') -> list['Languoid']:
"""
A faster alternative to `ancestors` when the relevant languoids have already
been read from disc.
"""
return [nodes[lineage[1]] for lineage in self.lineage]
[docs] def iter_ancestors(self) -> Generator['Languoid', None, None]:
"""Yield ancestors going up the directory tree."""
for parent in self.dir.parents:
id_ = parent.name
if Glottocode.pattern.match(id_):
yield Languoid.from_dir(parent, _api=self._api)
else:
# we ignore leading non-languoid-dir path components.
break
def iter_descendants(self) -> Generator['Languoid', None, None]: # pylint: disable=C0116
for child in self.children:
yield child
yield from child.iter_descendants()
@property
def ancestors(self) -> list['Languoid']:
"""
List of ancestors of the languoid in the classification tree, from root (i.e. top-level
family) to parent node.
.. note::
Using this on many languoids can be slow, because the directory tree may be traversed
and INI files read multiple times. To circumvent this problem, you may use a read-only
:class:`pyglottolog.Glottolog` instance, by passing `cache=True` at initialization.
"""
return list(reversed(list(self.iter_ancestors())))
@property
def parent(self) -> Optional['Languoid']:
"""
Parent languoid or `None`.
.. note::
Using this on many languoids can be slow, because the directory tree may be traversed
and INI files read multiple times. To circumvent this problem, you may use a read-only
:class:`pyglottolog.Glottolog` instance, by passing `cache=True` at initialization.
"""
try:
return next(self.iter_ancestors())
except StopIteration:
return None
@property
def family(self) -> Optional['Languoid']:
"""
Top-level family the languoid belongs to or `None`.
.. note::
Using this on many languoids can be slow, because the directory tree may be traversed
and INI files read multiple times. To circumvent this problem, you may use a read-only
:class:`pyglottolog.Glottolog` instance, by passing `cache=True` at initialization.
"""
return self.ancestors[0] if self.lineage else None
@property
def names(self) -> dict[str, list]:
"""
A `dict` mapping alternative name providers to `list` s of alternative names for the
languoid by the given provider.
"""
if 'altnames' in self.cfg:
return {k: self.cfg.getlist('altnames', k) for k in self.cfg['altnames']}
return {}
[docs] def add_name(self, name: str, type_: str = 'glottolog'):
"""Add an alternative name."""
names = self.cfg.getlist('altnames', type_)
if name not in names:
self.cfg.set('altnames', type_, sorted(names + [name]))
[docs] def update_names(self, names: Iterable[str], type_: str = 'glottolog') -> bool:
"""Update alternative names of a specific type."""
new = set(names)
if new != set(self.cfg.getlist('altnames', type_)):
self.cfg.set('altnames', type_, sorted(new))
return True
return False
@property
def identifier(self) -> Union[dict, configparser.SectionProxy]:
"""Alternative identifiers of the languoid."""
if 'identifier' in self.cfg:
return self.cfg['identifier']
return {}
@property
def sources(self) -> list[Reference]:
"""
List of Glottolog references linked to the languoid
:rtype: :class:`pyglottolog.references.Reference`
"""
if self.cfg.has_option('sources', 'glottolog'):
return Reference.from_list(self.cfg.getlist('sources', 'glottolog'))
return []
@sources.setter
def sources(self, refs):
assert all(isinstance(r, Reference) for r in refs)
self.cfg.set('sources', 'glottolog', [f'{ref}' for ref in refs])
@property
def endangerment(self) -> Optional[Endangerment]:
"""
Endangerment information about the languoid.
:rtype: :class:`Endangerment`
"""
if ('endangerment' in self.cfg) and self._api:
kw: dict[str, Any] = dict(self.cfg['endangerment'].items())
kw['status'] = self._api.aes_status.get(kw['status'])
if kw['source'] in self._api.aes_sources:
kw['source'] = self._api.aes_sources[kw['source']]
else:
ref = Reference.from_string(kw['source'])
kw['source'] = config.AESSource(
id=ref.key,
name=None,
url=None,
reference_id=ref.key,
pages=ref.pages)
return Endangerment(**kw)
return None # pragma: no cover
@property
def classification_comment(self) -> Optional[ClassificationComment]:
"""
Classification information about the languoid.
:rtype: :class:`ClassificationComment`
"""
if 'classification' in self.cfg:
cfg = self.cfg['classification']
return ClassificationComment(
family=cfg.get('family'),
familyrefs=self.cfg.getlist('classification', 'familyrefs'),
sub=cfg.get('sub'),
subrefs=self.cfg.getlist('classification', 'subrefs'))
return None # pragma: no cover
@property
def ethnologue_comment(self) -> Optional[EthnologueComment]:
"""
Commentary about the classification of the languoid in Ethnologue.
:rtype: :class:`EthnologueComment`
"""
section = 'hh_ethnologue_comment'
if section in self.cfg:
return EthnologueComment(**self.cfg[section])
return None # pragma: no cover
@property
def macroareas(self) -> list[config.Macroarea]:
"""
:rtype: `list` of :class:`config.Macroarea`
"""
if self._api:
return [
self._api.macroareas.get(n)
for n in self.cfg.getlist(self.section_core, 'macroareas')]
return []
@macroareas.setter
def macroareas(self, value):
if self._api:
assert isinstance(value, (list, tuple)) \
and all(self._api.macroareas.get(n) for n in value)
self._set('macroareas', [ma.name for ma in value])
@property
def timespan(self) -> Optional[tuple[int, int]]:
"""Extinct languages are associated with a timespan"""
value = self.cfg.get(self.section_core, 'timespan', fallback=None)
if not value:
return None
value = value.strip()
ma = ISO_8601_INTERVAL.fullmatch(value)
if ma is None:
raise ValueError('invalid interval', value) # pragma: no cover
dates = ma.group('start_date', 'end_date')
def fix_date(d, year_tmpl='{:04d}'):
year, sep, rest = d.partition('-')
assert year and sep and rest
year = year_tmpl.format(int(year))
return f'{year}{sep}{rest}'
dates = map(fix_date, dates)
dates = [datetime.datetime.strptime(d, '%Y-%m-%d').date() for d in dates]
if any((d.month, d.day) != (1, 1) for d in dates): # pragma: no cover
warnings.warn(f'ignoring non -1-1 date(s) month/day: {dates!r}')
start, end = dates
return (
-start.year if ma.group('start_sign') == '-' else start.year,
-end.year if ma.group('end_sign') == '-' else end.year)
@timespan.setter
def timespan(self, value: Union[tuple[int, int], list[int]]):
if not (isinstance(value, (list, tuple)) and len(value) == 2):
raise ValueError(value)
# https://en.wikipedia.org/wiki/ISO_8601#Years
if not all(-9999 <= v <= 9999 for v in value):
warnings.warn(f'serializing year(s) outside the four-digit-range: {value!r}')
def fmt(v):
sign = '-' if v < 0 else ''
return f'{sign}{abs(v):04d}'
self._set('timespan',
'{}-01-01/{}-01-01'.format(*map(fmt, value))) # pylint: disable=C0209
@property
def links(self) -> list[Link]:
"""
Links to web resources related to the languoid
"""
return [Link.from_string(s) for s in self.cfg.getlist(self.section_core, 'links')]
@links.setter
def links(self, value):
assert isinstance(value, list)
self._set(
'links',
[v.to_string() for v in
sorted(
[Link.from_(v) for v in value], key=lambda l_: (l_.label or 'zzzz', l_.domain))])
[docs] def update_links(self, domain: str, urls: Iterable[str]) -> bool:
"""Update the links section of the languoid for a particular domain."""
new = [li for li in self.links if li.domain != domain] + [Link.from_(u) for u in urls]
if set(new) != set(self.links):
self.links = new
return True
return False
@property
def countries(self) -> list[Country]:
"""Countries a language is spoken in."""
return [Country.from_text(n)
for n in self.cfg.getlist(self.section_core, 'countries')]
@countries.setter
def countries(self, value: Sequence[Country]):
assert isinstance(value, (list, tuple)) \
and all(isinstance(o, Country) for o in value)
self._set('countries', [f'{c}' for c in value])
@property
def name(self) -> str:
"""
The Glottolog mame of the languoid
"""
return self._get('name')
@name.setter
def name(self, value):
self._set('name', value)
@property
def latitude(self) -> Optional[float]:
"""
The geographic latitude of the point chosen as representative coordinate of the languoid
"""
return self._get('latitude', float)
@latitude.setter
def latitude(self, value):
self._set('latitude', round(float(value), 5))
@property
def longitude(self) -> Optional[float]:
"""
The geographic longitude of the point chosen as representative coordinate of the languoid
"""
return self._get('longitude', float)
@longitude.setter
def longitude(self, value):
self._set('longitude', round(float(value), 5))
@property
def hid(self) -> Optional[str]:
"""The languoid's "H(arald)ID", aka a "NOCODE" code."""
return self._get('hid')
@hid.setter
def hid(self, value):
self._set('hid', value)
@property
def level(self) -> Optional[config.LanguoidLevel]: # pylint: disable=C0116
if self._api:
return self._get('level', self._api.languoid_levels.get)
return self._get('level', lambda s: s)
@level.setter
def level(self, value):
if self._api:
self._set('level', self._api.languoid_levels.get(value).id)
@property
def iso(self) -> Optional[str]: # pylint: disable=C0116
return self._get('iso639-3')
@iso.setter
def iso(self, value):
self._set('iso639-3', value)
@property
def iso_code(self) -> Optional[str]: # pylint: disable=C0116
return self._get('iso639-3')
@iso_code.setter
def iso_code(self, value):
self._set('iso639-3', value)
[docs] def closest_iso(
self,
api: Optional['Glottolog'] = None,
nodes: Optional['LanguoidMapType'] = None,
) -> Optional[str]:
"""
ISO 639-3 code assigned to the languoid or one of its ancestors in the classification
(in case of dialects) or `None`.
.. seealso:: https://github.com/glottolog/glottolog-cldf/issues/13
"""
api = api or self._api
assert api
if self.iso:
return self.iso
for _, gc, _ in reversed(self.lineage):
lg = nodes[gc] if nodes else api.languoid(gc)
if lg.iso:
return lg.iso
return None # pragma: no cover
@property
def iso_retirement(self) -> Optional[ISORetirement]:
"""Information about a retired ISO code related to the languoid."""
if 'iso_retirement' in self.cfg:
kw = dict(self.cfg['iso_retirement'])
if 'change_to' in kw:
kw['change_to'] = self.cfg.getlist('iso_retirement', 'change_to')
if 'comment' in kw:
kw['comment'] = self.cfg.gettext('iso_retirement', 'comment')
return ISORetirement(**kw)
return None # pragma: no cover
@property
def fname(self) -> pathlib.Path:
"""The location of the languoid's info file in the Glottolog tree directory."""
return self.dir.joinpath(INFO_FILENAME)
LanguoidMapType = dict[str, Languoid]