Source code for pyglottolog.languoids.languoid

import os
import re
import typing
import pathlib
import datetime
import warnings
import functools
import configparser

from clldutils.inifile import INI
from newick import Node

from .models import (
    Glottocode, Country, Reference, Endangerment, Link,
    ClassificationComment, EthnologueComment, ISORetirement,
)
from pyglottolog import config

# Names exported by `from ... import *`.
__all__ = ['Languoid']

# Name of the INI file holding a languoid's metadata; it is read from and
# written to the languoid's own directory.
INFO_FILENAME = 'md.ini'

# Pattern for "<start>/<end>" intervals as stored in the `timespan` option:
# each side is an optional sign followed by a date with a 1-4 digit year and
# two-digit month and day. `re.ASCII` restricts `\d` to ASCII digits.
ISO_8601_INTERVAL = re.compile(
    r'(?P<start_sign>[+-]?)'
    r'(?P<start_date>\d{1,4}-\d{2}-\d{2})'
    r'/'
    r'(?P<end_sign>[+-]?)'
    r'(?P<end_date>\d{1,4}-\d{2}-\d{2})',
    flags=re.ASCII)


@functools.total_ordering
class Languoid(object):
    """
    Info on languoids is encoded in the INI files and in the directory hierarchy of
    :attr:`pyglottolog.Glottolog.tree`. This class provides access to all of it.

    **Languoid formatting**:

    :ivar _format_specs: A `dict` mapping custom format specifiers to conversion functions.

    Usage:

    .. code-block:: python

        >>> l = Languoid.from_name_id_level(pathlib.Path('.'), 'N(a,m)e', 'abcd1234', 'language')
        >>> '{0:newick_name}'.format(l)
        'N{a/m}e'

    .. seealso::

        `<https://www.python.org/dev/peps/pep-3101/#format-specifiers>`_
        and
        `<https://www.python.org/dev/peps/pep-3101/#controlling-formatting-on-a-per-type-basis>`_
    """
    section_core = 'core'

    def __init__(
            self,
            cfg: INI,
            lineage: typing.Union[None, typing.List[typing.Tuple[str, str, str]]] = None,
            id_: typing.Union[None, str] = None,
            directory: typing.Union[None, pathlib.Path] = None,
            tree: typing.Union[None, pathlib.Path] = None,
            _api=None):
        """
        Refer to the factory methods for typical use cases of instantiating a `Languoid`:

        - :meth:`Languoid.from_dir`
        - :meth:`Languoid.from_name_id_level`

        :param cfg: `INI` instance storing the languoid's metadata.
        :param lineage: list of ancestors (from root to this languoid), each given as
            `(name, glottocode, level)` triple.
        :param id_: Glottocode for the languoid (or `None`, if `directory` is passed).
        :param directory: Directory of the languoid; its name is used as Glottocode if `id_`
            is not given.
        :param tree: Root directory of the languoid tree; together with `lineage` it is used
            to compute the languoid's directory when `directory` is not given.
        :param _api: Some properties require access to config data which is accessed through a
            `Glottolog` API instance.
        """
        assert (id_ and tree) or directory
        if id_ is None:
            id_ = Glottocode(directory.name)
        lineage = lineage or []
        assert all(Glottocode.pattern.match(gc) for _, gc, _ in lineage)
        # With an API instance available, level names are resolved to level objects.
        self.lineage = [
            (name, gc, _api.languoid_levels.get(level) if _api else level)
            for name, gc, level in lineage]
        self.cfg = cfg
        self.dir = directory or tree.joinpath(*[gc for _, gc, _ in self.lineage])
        self._id = id_
        self._api = _api

    @classmethod
    def from_dir(cls, directory: pathlib.Path, nodes=None, _api=None, **kw):
        """
        Create a `Languoid` from a directory, named with the Glottocode and containing `md.ini`.

        This method is used by :class:`pyglottolog.Glottolog` to read `Languoid`s from the
        repository's `languoids/tree` directory.

        :param directory: Directory of the languoid.
        :param nodes: Optional `dict` mapping Glottocodes to `(name, id, level)` triples of
            languoids which have already been read; it is updated in place.
        :param _api: Optional `Glottolog` API instance; if it has a cache containing the
            languoid, the cached object is returned.
        """
        if _api and _api.cache and directory.name in _api.cache:
            return _api.cache[directory.name]

        if nodes is None:
            nodes = {}
        cfg = INI.from_file(directory.joinpath(INFO_FILENAME), interpolation=None)

        lineage = []
        for parent in directory.parents:
            id_ = parent.name
            assert id_ != directory.name
            if not Glottocode.pattern.match(id_):
                # we ignore leading non-languoid-dir path components.
                break

            if id_ not in nodes:
                l_ = Languoid.from_dir(parent, nodes=nodes, _api=_api, **kw)
                nodes[id_] = (l_.name, l_.id, l_.level)
            lineage.append(nodes[id_])

        # `directory.parents` is ordered from closest to most distant ancestor.
        res = cls(cfg, list(reversed(lineage)), directory=directory, _api=_api, **kw)
        nodes[res.id] = (res.name, res.id, res.level)
        return res

    @classmethod
    def from_name_id_level(cls, tree, name, id, level, **kw):
        """
        This method is used in `pyglottolog.lff` to instantiate `Languoid` s for new nodes
        encountered in "lff"-format trees.

        :param tree: Root directory of the languoid tree.
        :param name: Name of the languoid.
        :param id: Glottocode of the languoid.
        :param level: Level of the languoid.
        :param kw: `lineage` is passed to the constructor, all other items are set as \
        attributes on the new object.
        """
        cfg = INI(interpolation=None)
        cfg.read_dict(dict(core=dict(name=name)))
        res = cls(cfg, kw.pop('lineage', []), id_=Glottocode(id), tree=tree)
        for k, v in kw.items():
            setattr(res, k, v)
        # Note: Setting the level behaves differently when `_api` is available, so must be done
        # after all other attributes are initialized.
        res.level = level
        return res

    # We provide a couple of node label format specifications which can be used when serializing
    # trees in newick format.
    _format_specs = {
        'newick_name': (
            lambda l_: l_.name.replace(
                ',', '/').replace('(', '{').replace(')', '}').replace("'", "''"),
            "Languoid name with special newick characters replaced"),
        'newick_level': (
            lambda l_: '-l-' if getattr(l_.level, 'id', l_.level) == 'language' else '',
            "Languoid level in case of languages"),
        'newick_iso': (
            lambda l_: '[{0}]'.format(l_.iso) if l_.iso else '',
            "Bracketed ISO code or nothing"),
    }
    _newick_default_template = "'{l:newick_name} [{l.id}]{l:newick_iso}{l:newick_level}'"

    def __format__(self, format_spec):
        """Apply one of the custom specs in `_format_specs` or fall back to the default."""
        if format_spec in self._format_specs:
            return self._format_specs[format_spec][0](self)
        return object.__format__(self, format_spec)

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        """Languoids are equal if their Glottocodes are equal."""
        if not hasattr(other, 'id'):
            # Let Python try the reflected operation instead of failing with AttributeError.
            return NotImplemented
        return self.id == other.id

    def __lt__(self, other):
        """
        To allow Languoid lists to be sorted, we implement a simple ordering by Glottocode.
        """
        if not hasattr(other, 'id'):
            return NotImplemented
        return self.id < other.id

    def __repr__(self):
        return '<%s %s>' % (getattr(self.level, 'name', self.level).capitalize(), self.id)

    def __str__(self):
        return '%s [%s]' % (self.name, self.id)

    def _set(self, key, value, section=None):
        """Set option `key` in `section` (default: core section); `None` removes the option."""
        section = section or self.section_core
        if value is None and key in self.cfg[section]:
            del self.cfg[section][key]
        else:
            self.cfg.set(section, key, value)

    def _get(self, key, type_=None):
        """Return option `key` of the core section, converted with `type_` if set and non-empty."""
        res = self.cfg.get(self.section_core, key, fallback=None)
        if type_ and res:
            return type_(res)
        return res

    def newick_node(self, nodes=None, template=None, maxlevel=None, level=0) -> Node:
        """
        Return a `newick.Node` representing the subtree of the Glottolog classification starting
        at the languoid.

        :param nodes: Optional `dict` mapping Glottocodes to `Languoid` objects, used to look up
            children without reading them from disk.
        :param template: Python format string accepting the `Languoid` instance as single
            variable named `l`, used to format node labels.
        :param maxlevel: Either a `config.LanguoidLevel`, compared against the level of each
            child, or a number, compared against the depth `level` of the current node.
        :param level: Depth of the current node, incremented on recursion.
        """
        template = template or self._newick_default_template
        n = Node(name=template.format(l=self), length='1')  # noqa: E741

        children = self.children if nodes is None else self.children_from_nodemap(nodes)
        for nn in sorted(children, key=lambda nn: nn.name):
            if maxlevel:
                if (isinstance(maxlevel, config.LanguoidLevel) and nn.level > maxlevel) or \
                        (not isinstance(maxlevel, config.LanguoidLevel) and level > maxlevel):
                    continue
            n.add_descendant(
                nn.newick_node(nodes=nodes, template=template, maxlevel=maxlevel, level=level + 1))
        return n

    def write_info(self, outdir: typing.Union[None, pathlib.Path] = None):
        """
        Write `Languoid` metadata as INI file to `outdir/<INFO_FILENAME>`.

        :param outdir: Target directory (default: the languoid's own directory). If its name is
            not the Glottocode, a sub-directory named with the Glottocode is used.
        :return: Path of the file written.
        """
        outdir = outdir or self.dir
        if not isinstance(outdir, pathlib.Path):
            outdir = pathlib.Path(outdir)
        if outdir.name != self.id:
            outdir = outdir.joinpath(self.id)
        if not outdir.exists():
            outdir.mkdir()
        fname = outdir.joinpath(INFO_FILENAME)
        self.cfg.write(fname)
        # Where the platform line separator is a bare LF, re-write the file with CRLF line
        # endings.
        if os.linesep == '\n':
            with fname.open(encoding='utf8') as fp:
                text = fp.read()
            with fname.open('w', encoding='utf8') as fp:
                fp.write(text.replace('\n', '\r\n'))
        return fname

    # -------------------------------------------------------------------------
    # Accessing info of a languoid
    # -------------------------------------------------------------------------
    @property
    def glottocode(self):
        """Alias for `id`"""
        return self._id

    @property
    def id(self):
        return self._id

    @property
    def category(self):
        """
        Languoid category.

        - Category name from :class:`pyglottolog.config.LanguageType` for languoids of level
          "language",
        - `"Family"` or `"Pseudo Family"` for families,
        - `"Dialect"` for dialects.

        `None` is returned if no API instance is available.
        """
        # Computing the category requires access to config data:
        if self._api:
            pseudo_families = {
                c.pseudo_family_id: c.category for c in self._api.language_types.values()}
            fid = self.lineage[0][1] if self.lineage else None
            if self.level == self._api.languoid_levels.language:
                return pseudo_families.get(fid, self._api.language_types['spoken_l1'].category)
            cat = self.level.name.capitalize()
            if self.level == self._api.languoid_levels.family:
                if self.id.startswith('unun9') or \
                        self.id in pseudo_families or fid in pseudo_families:
                    cat = 'Pseudo ' + cat
            return cat

    @property
    def isolate(self) -> bool:
        """
        Flag signaling whether the languoid is an isolate, i.e. has level "language" and is not
        member of a family.
        """
        return getattr(self.level, 'id', self.level) == 'language' and not self.lineage

    def children_from_nodemap(self, nodes):
        # A faster alternative to `children` when the relevant languoids have already been
        # read from disc.
        return [nodes[d.name] for d in self.dir.iterdir() if d.is_dir()]

    def descendants_from_nodemap(self, nodes, level=None):
        """
        Select the languoids in `nodes` which have this languoid in their lineage.

        :param nodes: `dict` mapping Glottocodes to `Languoid` objects.
        :param level: Optional level (object or name) to restrict the selection to; resolving a
            name requires an API instance.
        """
        if isinstance(level, str):
            level = self._api.languoid_levels.get(level)
        return [
            n for n in nodes.values() if
            n.lineage and self.id in [li[1] for li in n.lineage] and  # noqa: W504
            ((level is None) or n.level == level)]

    @property
    def children(self) -> typing.List['Languoid']:
        """
        List of direct descendants of the languoid in the classification tree.

        .. note::

            Using this on many languoids can be slow, because the directory tree may be
            traversed and INI files read multiple times. To circumvent this problem, you may
            use a read-only :class:`pyglottolog.Glottolog` instance, by passing `cache=True`
            at initialization.
        """
        return [Languoid.from_dir(d, _api=self._api) for d in self.dir.iterdir() if d.is_dir()]

    def ancestors_from_nodemap(self, nodes):
        # A faster alternative to `ancestors` when the relevant languoids have already
        # been read from disc.
        return [nodes[lineage[1]] for lineage in self.lineage]

    def iter_ancestors(self):
        """Yield the ancestors of the languoid, from parent to top-level family."""
        for parent in self.dir.parents:
            id_ = parent.name
            if Glottocode.pattern.match(id_):
                yield Languoid.from_dir(parent, _api=self._api)
            else:
                # we ignore leading non-languoid-dir path components.
                break

    def iter_descendants(self):
        """Yield all descendants of the languoid, depth first."""
        for child in self.children:
            yield child
            yield from child.iter_descendants()

    @property
    def ancestors(self) -> typing.List['Languoid']:
        """
        List of ancestors of the languoid in the classification tree, from root (i.e. top-level
        family) to parent node.

        .. note::

            Using this on many languoids can be slow, because the directory tree may be
            traversed and INI files read multiple times. To circumvent this problem, you may
            use a read-only :class:`pyglottolog.Glottolog` instance, by passing `cache=True`
            at initialization.
        """
        return list(reversed(list(self.iter_ancestors())))

    @property
    def parent(self) -> typing.Union['Languoid', None]:
        """
        Parent languoid or `None`.

        .. note::

            Using this on many languoids can be slow, because the directory tree may be
            traversed and INI files read multiple times. To circumvent this problem, you may
            use a read-only :class:`pyglottolog.Glottolog` instance, by passing `cache=True`
            at initialization.
        """
        return next(self.iter_ancestors(), None)

    @property
    def family(self) -> typing.Union['Languoid', None]:
        """
        Top-level family the languoid belongs to or `None`.

        .. note::

            Using this on many languoids can be slow, because the directory tree may be
            traversed and INI files read multiple times. To circumvent this problem, you may
            use a read-only :class:`pyglottolog.Glottolog` instance, by passing `cache=True`
            at initialization.
        """
        return self.ancestors[0] if self.lineage else None

    @property
    def names(self) -> typing.Dict[str, list]:
        """
        A `dict` mapping alternative name providers to `list` s of alternative names for the
        languoid by the given provider.
        """
        if 'altnames' in self.cfg:
            return {k: self.cfg.getlist('altnames', k) for k in self.cfg['altnames']}
        return {}

    def add_name(self, name, type_='glottolog'):
        """Add `name` to the alternative names of provider `type_`, unless already listed."""
        names = self.cfg.getlist('altnames', type_)
        if name not in names:
            self.cfg.set('altnames', type_, sorted(names + [name]))

    def update_names(self, names, type_='glottolog'):
        """
        Replace the alternative names of provider `type_` with `names`.

        :return: `True` if the set of names changed, `False` otherwise.
        """
        new = set(names)
        if new != set(self.cfg.getlist('altnames', type_)):
            self.cfg.set('altnames', type_, sorted(new))
            return True
        return False

    @property
    def identifier(self) -> typing.Union[dict, configparser.SectionProxy]:
        """The `identifier` section of the metadata, or an empty `dict`."""
        if 'identifier' in self.cfg:
            return self.cfg['identifier']
        return {}

    @property
    def sources(self) -> typing.List[Reference]:
        """
        List of Glottolog references linked to the languoid

        :rtype: :class:`pyglottolog.references.Reference`
        """
        if self.cfg.has_option('sources', 'glottolog'):
            return Reference.from_list(self.cfg.getlist('sources', 'glottolog'))
        return []

    @sources.setter
    def sources(self, refs):
        assert all(isinstance(r, Reference) for r in refs)
        self.cfg.set('sources', 'glottolog', ['{0}'.format(ref) for ref in refs])

    @property
    def endangerment(self) -> typing.Union[None, Endangerment]:
        """
        Endangerment information about the languoid.

        `None` is returned if there is no such information or no API instance is available.

        :rtype: :class:`Endangerment`
        """
        if ('endangerment' in self.cfg) and self._api:
            kw = dict(self.cfg['endangerment'].items())
            kw['status'] = self._api.aes_status.get(kw['status'])
            if kw['source'] in self._api.aes_sources:
                kw['source'] = self._api.aes_sources[kw['source']]
            else:
                # Not a known source id, so the value is parsed as a reference.
                ref = Reference.from_string(kw['source'])
                kw['source'] = config.AESSource(
                    id=ref.key, name=None, url=None, reference_id=ref.key, pages=ref.pages)
            return Endangerment(**kw)

    @property
    def classification_comment(self) -> typing.Union[None, ClassificationComment]:
        """
        Classification information about the languoid.

        :rtype: :class:`ClassificationComment`
        """
        if 'classification' in self.cfg:
            cfg = self.cfg['classification']
            return ClassificationComment(
                family=cfg.get('family'),
                familyrefs=self.cfg.getlist('classification', 'familyrefs'),
                sub=cfg.get('sub'),
                subrefs=self.cfg.getlist('classification', 'subrefs'))

    @property
    def ethnologue_comment(self) -> typing.Union[None, EthnologueComment]:
        """
        Commentary about the classification of the languoid in Ethnologue.

        :rtype: :class:`EthnologueComment`
        """
        section = 'hh_ethnologue_comment'
        if section in self.cfg:
            return EthnologueComment(**self.cfg[section])

    @property
    def macroareas(self) -> typing.List[config.Macroarea]:
        """
        Macroareas of the languoid; an empty `list` if no API instance is available.

        :rtype: `list` of :class:`config.Macroarea`
        """
        if self._api:
            return [
                self._api.macroareas.get(n)
                for n in self.cfg.getlist(self.section_core, 'macroareas')]
        return []

    @macroareas.setter
    def macroareas(self, value):
        if self._api:
            assert isinstance(value, (list, tuple)) \
                and all(self._api.macroareas.get(n) for n in value)
        self._set('macroareas', [ma.name for ma in value])

    @property
    def timespan(self, _date_format='%Y-%m-%d'):
        """
        The `timespan` option as pair `(start_year, end_year)` of (possibly negative) integers,
        or `None` if the option is not set.

        :raises ValueError: if the stored value does not match `ISO_8601_INTERVAL`.
        """
        value = self.cfg.get(self.section_core, 'timespan', fallback=None)
        if not value:
            return None
        value = value.strip()
        ma = ISO_8601_INTERVAL.fullmatch(value)
        if ma is None:
            raise ValueError('invalid interval', value)  # pragma: no cover
        dates = ma.group('start_date', 'end_date')

        def fix_date(d, year_tmpl='{:04d}'):
            # Zero-pad the year to four digits before the date is parsed.
            year, sep, rest = d.partition('-')
            assert year and sep and rest
            year = year_tmpl.format(int(year))
            return '{}{}{}'.format(year, sep, rest)

        dates = map(fix_date, dates)
        dates = [datetime.datetime.strptime(d, _date_format).date() for d in dates]
        if any((d.month, d.day) != (1, 1) for d in dates):  # pragma: no cover
            warnings.warn('ignoring non -1-1 date(s) month/day: {!r}'.format(dates))
        start, end = dates
        return (
            -start.year if ma.group('start_sign') == '-' else start.year,
            -end.year if ma.group('end_sign') == '-' else end.year)

    @timespan.setter
    def timespan(self, value):
        if not (isinstance(value, (list, tuple)) and len(value) == 2):
            raise ValueError(value)
        # https://en.wikipedia.org/wiki/ISO_8601#Years
        if not all(-9999 <= v <= 9999 for v in value):
            warnings.warn('serializing year(s) outside the four-digit-range: {!r}'.format(value))

        def fmt(v):
            sign = '-' if v < 0 else ''
            return '{}{:04d}'.format(sign, abs(v))

        self._set('timespan', '{}-01-01/{}-01-01'.format(*map(fmt, value)))

    @property
    def links(self) -> typing.List[Link]:
        """
        Links to web resources related to the languoid
        """
        return [Link.from_string(s) for s in self.cfg.getlist(self.section_core, 'links')]

    @links.setter
    def links(self, value):
        assert isinstance(value, list)
        # Links are stored sorted by label (unlabeled ones under 'zzzz') and domain.
        self._set(
            'links',
            [v.to_string() for v in sorted(
                [Link.from_(v) for v in value], key=lambda l_: (l_.label or 'zzzz', l_.domain))])

    def update_links(self, domain, urls):
        """
        Replace the links for `domain` with `urls`.

        :return: `True` if the set of links changed, `False` otherwise.
        """
        new = [li for li in self.links if li.domain != domain] + [Link.from_(u) for u in urls]
        if set(new) != set(self.links):
            self.links = new
            return True
        return False

    @property
    def countries(self) -> typing.List[Country]:
        """
        Countries a language is spoken in.
        """
        return [Country.from_text(n) for n in self.cfg.getlist(self.section_core, 'countries')]

    @countries.setter
    def countries(self, value):
        assert isinstance(value, (list, tuple)) \
            and all(isinstance(o, Country) for o in value)
        self._set('countries', ['{0}'.format(c) for c in value])

    @property
    def name(self):
        """
        The Glottolog name of the languoid
        """
        return self._get('name')

    @name.setter
    def name(self, value):
        self._set('name', value)

    @property
    def latitude(self) -> typing.Union[None, float]:
        """
        The geographic latitude of the point chosen as representative coordinate of the languoid
        """
        return self._get('latitude', float)

    @latitude.setter
    def latitude(self, value):
        self._set('latitude', round(float(value), 5))

    @property
    def longitude(self) -> typing.Union[None, float]:
        """
        The geographic longitude of the point chosen as representative coordinate of the languoid
        """
        return self._get('longitude', float)

    @longitude.setter
    def longitude(self, value):
        self._set('longitude', round(float(value), 5))

    @property
    def hid(self):
        return self._get('hid')

    @hid.setter
    def hid(self, value):
        self._set('hid', value)

    @property
    def level(self):
        """
        The level of the languoid: a level object looked up via the API instance if one is
        available, the raw value stored in the metadata otherwise.
        """
        if self._api:
            return self._get('level', self._api.languoid_levels.get)
        return self._get('level', lambda s: s)

    @level.setter
    def level(self, value):
        if self._api:
            self._set('level', self._api.languoid_levels.get(value).id)
        else:
            # Without an API instance the level cannot be validated; store its plain id so
            # that the value is not silently dropped.
            self._set('level', getattr(value, 'id', value))

    @property
    def iso(self):
        return self._get('iso639-3')

    @iso.setter
    def iso(self, value):
        self._set('iso639-3', value)

    @property
    def iso_code(self):
        """Alias for `iso`"""
        return self._get('iso639-3')

    @iso_code.setter
    def iso_code(self, value):
        self._set('iso639-3', value)

    def closest_iso(self, api=None, nodes=None) -> typing.Union[str, None]:
        """
        ISO 639-3 code assigned to the languoid or one of its ancestors in the classification
        (in case of dialects) or `None`.

        :param api: `Glottolog` API instance (default: the instance the languoid was created
            with); one of the two is required.
        :param nodes: Optional `dict` mapping Glottocodes to `Languoid` objects, used instead
            of `api.languoid` to look up ancestors.

        .. seealso:: https://github.com/glottolog/glottolog-cldf/issues/13
        """
        api = api or self._api
        assert api
        if self.iso:
            return self.iso
        # Walk up the classification, starting with the parent.
        for _, gc, _ in reversed(self.lineage):
            lg = nodes[gc] if nodes else api.languoid(gc)
            if lg.iso:
                return lg.iso

    @property
    def iso_retirement(self):
        """`ISORetirement` built from the `iso_retirement` section, or `None` if absent."""
        if 'iso_retirement' in self.cfg:
            kw = dict(self.cfg['iso_retirement'])
            if 'change_to' in kw:
                kw['change_to'] = self.cfg.getlist('iso_retirement', 'change_to')
            if 'comment' in kw:
                kw['comment'] = self.cfg.gettext('iso_retirement', 'comment')
            return ISORetirement(**kw)

    @property
    def fname(self):
        """Path of the languoid's INI file."""
        return self.dir.joinpath(INFO_FILENAME)