# Source code for pyglottolog.languoids.models

import re
import typing
import collections
import urllib.parse

import attr
import markdown
import pycountry
from clldutils.misc import slug, nfilter
from clldutils import jsonlib
from dateutil import parser

from ..util import message
from ..config import AESSource, AES
from ..references import Entry

# Names exported by a star-import of this module.
__all__ = [
    'Glottocode', 'Glottocodes',
    'Reference',
    'Country',
    'ClassificationComment',
    'ISORetirement',
    'Endangerment',
    'EthnologueComment',
    'Link',
]


@attr.s(hash=True)
class Link(object):
    """
    A URL with an optional label.

    The string representation is either the bare URL or, if a label is present,
    a markdown-style link ``[label](url)``.
    """
    url = attr.ib()  #:
    label = attr.ib(default=None)  #:

    @property
    def domain(self):
        """The network location component (``netloc``) of the URL."""
        parsed = urllib.parse.urlparse(self.url)
        return parsed.netloc

    @classmethod
    def from_string(cls, s):
        """
        Parse a link from a bare URL or from markdown-style ``[label](url)`` notation.
        """
        text = s.strip()
        if not text.startswith('['):
            # No label given: the whole string is the URL.
            return cls(text)
        assert text.endswith(')') and '](' in text
        # Strip the outer "[" and ")"; the remainder is "label](url".
        pieces = text[1:-1].split('](')
        return cls(*pieces[::-1])

    @classmethod
    def from_(cls, obj):
        """
        Coerce `obj` into a `Link`.

        Accepted are `Link` instances (returned as is), strings (see `from_string`),
        `dict` objects of keyword arguments and 2-item lists or tuples of positional
        arguments.

        :raises TypeError: for any other type of object.
        """
        if isinstance(obj, cls):
            return obj
        if isinstance(obj, str):
            return cls.from_string(obj)
        if isinstance(obj, dict):
            return cls(**obj)
        if isinstance(obj, (list, tuple)) and len(obj) == 2:
            return cls(*obj)
        raise TypeError()

    def to_string(self):
        """Serialize the link in the notation understood by `from_string`."""
        if not self.label:
            return self.url
        return '[{0}]({1})'.format(self.label, self.url)

    def __json__(self):
        """The attributes of the link as `dict`."""
        return attr.asdict(self)


class Glottocodes(object):
    """
    Registry keeping track of glottocodes that have been dealt out.

    The registry is loaded from (and written back to) a JSON file, mapping the
    four-character prefix of glottocodes to the highest number dealt out for this prefix.
    Numbers for a prefix start at 1234.
    """
    def __init__(self, fname):
        """
        :param fname: Path of the JSON file storing the registry.
        """
        self._fname = fname
        self._store = jsonlib.load(self._fname)

    def __contains__(self, item):
        """
        Check whether the prefix of `item` is registered and its number does not exceed \
        the highest number dealt out for this prefix.

        :raises ValueError: if `item` is not a syntactically valid glottocode.
        """
        alpha, num = Glottocode(item).split()
        return alpha in self._store and num <= self._store[alpha]

    def __iter__(self):
        """Yield all glottocodes from 1234 up to the highest registered number, per prefix."""
        for alpha, num in self._store.items():
            for n in range(1234, num + 1):
                yield '{0}{1}'.format(alpha, n)

    def new(self, name, dry_run=False):
        """
        Deal out the next free glottocode for the prefix derived from `name`.

        :param name: Name from which the four-character prefix is derived via `slug`.
        :param dry_run: If `True`, the registry (and the file backing it) is left untouched.
        :return: The new `Glottocode`.
        """
        alpha = slug(str(name))[:4]
        assert alpha, 'name must yield a non-empty slug'
        # Pad prefixes shorter than four characters by repeating the last character.
        alpha = alpha.ljust(4, alpha[-1])
        # Prefixes not seen before start at 1234.
        num = self._store.get(alpha, 1233) + 1
        if not dry_run:
            self._store[alpha] = num
            # Store the updated dictionary of glottocodes back, sorted by prefix.
            ordered = collections.OrderedDict(sorted(self._store.items()))
            jsonlib.dump(ordered, self._fname, indent=4)
        return Glottocode('%s%s' % (alpha, num))


class Glottocode(str):
    """
    A glottocode: four lowercase alphanumeric characters followed by four digits.

    :raises ValueError: if the string passed on instantiation is not a valid glottocode.
    """
    regex = r'[a-z0-9]{4}[0-9]{4}'
    pattern = re.compile(regex + r'$')

    def __new__(cls, content):
        # `fullmatch` rather than `match`: "$" also matches right before a trailing
        # newline, so `match` would let a code followed by a line break slip through.
        if not cls.pattern.fullmatch(content):
            raise ValueError(content)
        return str.__new__(cls, content)

    def split(self):
        """
        Split the glottocode into its four-character prefix and its number.

        .. note:: This overrides `str.split` with a different signature.

        :return: `tuple` (prefix, number)
        """
        return self[:4], int(self[4:])


@attr.s
class Reference(object):
    """
    A reference of a bibliographical record in Glottolog.
    """
    #: Key of the form "<provider>:<bibkey>"
    key = attr.ib()  #:
    pages = attr.ib(default=None)  #:
    trigger = attr.ib(default=None)
    #: The tag closing the key when the reference was parsed ("**" or "(**)")
    endtag = attr.ib(default='**')
    #: Matches ``**provider:bibkey**``, optionally followed by ``:pages`` and
    #: ``<trigger "...">``
    pattern = re.compile(
        r"\*\*(?P<key>[a-z0-9\-_]+:[a-zA-Z.?\-;*'/()\[\]!_:0-9\u2014]+?)(?P<endtag>\*\*|\(\*\*\))"
        r"(:(?P<pages>[0-9\-f]+))?"
        r'(<trigger "(?P<trigger>[^\"]+)">)?')
    #: Alternative pattern for references of the form ``text[pages] (123 key)``
    old_pattern = re.compile(r'[^\[]+\[(?P<pages>[^\]]*)\]\s*\([0-9]+\s+(?P<key>[^\)]+)\)')

    def __str__(self):
        # NOTE: the key is always closed with "**", whatever the value of `endtag`.
        res = '**{0.key}**'.format(self)
        if self.pages:
            res += ':{0.pages}'.format(self)
        if self.trigger:
            res += '<trigger "{0.trigger}">'.format(self)
        return res

    def get_source(self, api) -> Entry:
        """
        Retrieve the referenced bibliographical record.

        :param api: Object with a `bibfiles` attribute which can be indexed by name of a \
        bibfile, yielding an object which can be indexed by key of a record.
        """
        return api.bibfiles[self.bibname][self.bibkey]

    @property
    def provider(self):
        """The part of the key before the first colon."""
        return self.key.split(':')[0]

    @property
    def bibname(self):
        """Name of the bibfile of the provider, i.e. "<provider>.bib"."""
        return '{0}.bib'.format(self.provider)

    @property
    def bibkey(self):
        """The part of the key after the first colon."""
        return self.key.split(':', 1)[1]

    @classmethod
    def from_match(cls, match):
        """
        Create a `Reference` from the named groups of a regex match.

        :raises AssertionError: if `match` is `None`.
        """
        # Raised explicitly rather than via `assert`, so the check is not stripped when
        # running with `python -O`.
        if not match:
            raise AssertionError('no match')
        return cls(**match.groupdict())

    @classmethod
    def from_string(cls, string, pattern=None):
        """
        Parse a `Reference` from the start of a (whitespace-stripped) string.

        :param pattern: Compiled regex to use instead of `Reference.pattern`.
        :raises ValueError: if the string does not match the pattern.
        """
        match = (pattern or cls.pattern).match(string.strip())
        try:
            return cls.from_match(match)
        except AssertionError:
            raise ValueError('Invalid reference: {0}'.format(string))

    @classmethod
    def from_list(cls, list_, pattern=None):
        """
        Parse a `list` of `Reference` objects from strings, skipping blank items.

        :raises ValueError: if a non-blank item does not match the pattern.
        """
        # `from_string` already signals invalid items with a `ValueError`.
        return [cls.from_string(s, pattern=pattern) for s in list_ if s.strip()]


@attr.s
class Country(object):
    """
    A country in which a Glottolog languoid is spoken, identified by its
    ISO 3166 Alpha-2 code.

    .. seealso:: https://en.wikipedia.org/wiki/ISO_3166-1
    """
    id = attr.ib()  #: ISO 3166 alpha 2 code
    name = attr.ib()  #: name

    def __str__(self):
        return self._format()

    def _format(self, minimal=True):
        """Format as bare code (`minimal`) or as "name (code)"."""
        if minimal:
            template = '{0.id}'
        else:
            template = '{0.name} ({0.id})'
        return template.format(self)

    @classmethod
    def from_name(cls, name):
        """Look up a country by name in `pycountry`; `None` if the lookup yields nothing."""
        country = pycountry.countries.get(name=name)
        if not country:
            return None
        return cls(id=country.alpha_2, name=country.name)

    @classmethod
    def from_id(cls, id_):
        """Look up a country by alpha-2 code in `pycountry`; `None` if the lookup yields nothing."""
        country = pycountry.countries.get(alpha_2=id_)
        if not country:
            return None
        return cls(id=country.alpha_2, name=country.name)

    @classmethod
    def from_text(cls, text):
        """
        Look up a country from text which is either a two-letter uppercase code, contains
        such a code (optionally in parentheses), or is a country name.
        """
        found = re.search(r'(?P<code_only>^[A-Z]{2}$)|\(?(?P<code>[A-Z]{2})\)?', text)
        if found is None:
            # No two-letter code in the text, so treat it as name.
            return cls.from_name(text)
        return cls.from_id(found.group('code_only') or found.group('code'))


@attr.s
class ClassificationComment(object):
    """
    Commentary on the classification of the languoid
    """
    #: Commentary on the internal classification of the descendants of the languoid
    sub = attr.ib(default=None)
    #: References for the internal classification
    subrefs: typing.List[Reference] = attr.ib(
        default=attr.Factory(list), converter=Reference.from_list)
    #: Commentary on the classification of the languoid within its family
    family = attr.ib(default=None)
    #: References for the family classification
    familyrefs: typing.List[Reference] = attr.ib(
        default=attr.Factory(list), converter=Reference.from_list)

    def merged_refs(self, type):
        """
        Merge the references cited inline in a comment with the explicitly listed ones.

        :param type: "sub" or "family"
        :return: `list` with one `Reference` per key; `pages` is the sorted, \
        ";"-joined set of all page specifications found for the key, or `None`.
        """
        assert type in ['sub', 'family']
        res = collections.defaultdict(set)
        # References cited within the text of the comment ...
        for m in Reference.pattern.finditer(getattr(self, type) or ''):
            res[m.group('key')].add(m.group('pages'))
        # ... and references listed in the corresponding "refs" attribute.
        for ref in getattr(self, type + 'refs'):
            res[ref.key].add(ref.pages)
        return [
            Reference(key=key, pages=';'.join(sorted(nfilter(pages))) or None)
            for key, pages in res.items()]

    def check(self, lang, keys, log):
        """
        Log an error for each referenced bibkey which is not in `keys`.

        :param lang: Passed into `message` to create the log message.
        :param keys: Container of valid keys.
        :param log: Object with an `error` method.
        :return: Always `False`.
        """
        # Listed references first, then references cited inline in the comments.
        cited = [
            (attrib, ref.key)
            for attrib in ['subrefs', 'familyrefs'] for ref in getattr(self, attrib)]
        cited.extend(
            (attrib, m.group('key'))
            for attrib in ['sub', 'family']
            for m in Reference.pattern.finditer(getattr(self, attrib) or ''))

        for attrib, key in cited:
            if key not in keys:
                log.error(message(
                    lang, 'classification {0}: invalid bibkey: {1}'.format(attrib, key)))
        return False


@attr.s
class ISORetirement(object):
    """
    Information extracted from accepted ISO 639-3 change requests about retired ISO codes
    associated with the languoid.
    """
    code = attr.ib(default=None)  #: Retired ISO 639-3 code
    name = attr.ib(default=None)  #: Name of the retired ISO language
    change_request = attr.ib(default=None)  #: Number of the ISO change request
    effective = attr.ib(default=None)  #: Date of acceptance of the change request
    reason = attr.ib(default=None)  #: Reason to retire the ISO code
    change_to = attr.ib(default=attr.Factory(list))  #: List of ISO codes replacing the retired code
    remedy = attr.ib(default=None)  #: What to do about the retired code
    # The converter drops a dot directly following a line break.
    # NOTE(review): presumably the dots are an artifact of how multi-line comments are
    # stored - confirm.
    comment = attr.ib(converter=lambda s: s.replace('\n.', '\n') if s else s, default=None)  #:

    def asdict(self):
        """The attributes of the object as `dict`."""
        return attr.asdict(self)

    __json__ = asdict


@attr.s
class Endangerment(object):
    """
    Info about the endangerment status of the languoid
    """
    status: AES = attr.ib(validator=attr.validators.instance_of(AES))  #:
    source: AESSource = attr.ib(validator=attr.validators.instance_of(AESSource))  #:
    comment = attr.ib()  #:
    #: Date when the endangerment status was assessed
    #: (converted with `dateutil.parser.parse`)
    date = attr.ib(converter=parser.parse)

    def __json__(self):
        """
        The attributes of the object as (recursively converted) `dict`, with the date
        reduced to the part of its ISO format before the "T" separator.
        """
        res = attr.asdict(self, recurse=True)
        res['date'] = res['date'].isoformat().split('T')[0]
        return res


def valid_ethnologue_versions(inst, attr, value):
    """
    Validator making sure each item in `value` is an Ethnologue version of the form
    "E" followed by a two-digit number (without leading zero) or the string "ISO 639-3".

    :raises ValueError: if an item is invalid.
    """
    is_version = re.compile(r'(E[1-9][0-9]|ISO 639-3)$').match
    for version in value:
        if is_version(version) is None:  # pragma: no cover
            raise ValueError('invalid ethnologue_versions: {0}'.format('/'.join(value)))


def valid_comment_type(inst, attr, value):
    """
    Validator making sure `value` is one of "spurious" or "missing".

    :raises ValueError: for any other value.
    """
    if value in ('spurious', 'missing'):
        return
    raise ValueError('invalid comment type: {0}'.format(value))


def valid_comment(inst, attr, value):
    """
    Validator making sure `value` is a non-empty `str`.

    :raises ValueError: with `value` as sole argument otherwise.
    """
    if value and isinstance(value, str):
        return
    raise ValueError(value)


@attr.s
class EthnologueComment(object):
    """
    Commentary about the classification of the languoid according to Ethnologue
    """
    # There's the isohid field which says which iso/hid the comment concerns.
    isohid = attr.ib()

    #: Either
    #:
    #: - "spurious" meaning the comment is to explain why the languoid in question is \
    #:   spurious and in which Ethnologue (as below) that is/was
    #: - "missing" meaning the comment is to explain why the languoid in question is \
    #:   missing (as a language entry) and in which Ethnologue (as below) that is/was
    #:
    #: The value is lowercased on assignment.
    comment_type = attr.ib(validator=valid_comment_type, converter=lambda s: s.lower())

    #: Which Ethnologue version(s)
    #: from E16-E19 the comment pertains to, joined by /:s. E.g. E16/E17. In the case of
    #: comment_type=spurious, E16/E17 in the version field means that the code was spurious
    #: in E16/E17 but no longer spurious in E18/E19. In the case of comment_type=missing,
    #: E16/E17 would mean that the code was missing from E16/E17, but present in E18/E19.
    #: If the comment concerns a language where versions would be the empty string,
    #: instead the string ISO 639-3 appears.
    #:
    #: The "/"-joined string is split into a `list` on assignment, after correcting
    #: the transposed digits "693" to "639".
    ethnologue_versions = attr.ib(
        default='',
        validator=valid_ethnologue_versions,
        converter=lambda s: s.replace('693', '639').split('/'))
    #: Non-empty text of the comment
    comment = attr.ib(default=None, validator=valid_comment)  #:

    def __json__(self):
        """The attributes of the object as `dict`."""
        return attr.asdict(self)

    def check(self, lang, keys, log):
        """
        Log an error if the comment cannot be rendered as markdown, and for each bibkey
        referenced in the comment which is not in `keys`.

        :param lang: Passed into `message` to create the log message.
        :param keys: Container of valid keys.
        :param log: Object with an `error` method.
        :return: Always `False`.
        """
        try:
            # Rendering is only done to detect markup errors; the result is discarded.
            markdown.markdown(self.comment)
        except Exception as e:  # pragma: no cover
            log.error(message(lang, 'ethnologue comment: invalid markup: {0}'.format(e)))
        for m in Reference.pattern.finditer(self.comment):
            if m.group('key') not in keys:
                log.error(message(lang, 'ethnologue comment: invalid bibkey: {0}'.format(
                    m.group('key'))))
        return False