import re
import typing
import collections
import urllib.parse
import attr
import markdown
import pycountry
from clldutils.misc import slug, nfilter
from clldutils import jsonlib
from dateutil import parser
from ..util import message
from ..config import AESSource, AES
from ..references import Entry
# Public API of this module: the names made available by a star-import
# (``from <module> import *``).
__all__ = [
'Glottocode', 'Glottocodes',
'Reference',
'Country',
'ClassificationComment',
'ISORetirement',
'Endangerment',
'EthnologueComment',
'Link',
]
@attr.s(hash=True)
class Link(object):
    """
    A hyperlink, optionally accompanied by a label.

    The string representation of a link is either the bare URL or - if a label is
    present - the markdown notation ``[label](url)``.
    """
    url = attr.ib()  #:
    label = attr.ib(default=None)  #:

    @property
    def domain(self):
        """The network location (``netloc``) component of the link's URL."""
        parsed = urllib.parse.urlparse(self.url)
        return parsed.netloc

    @classmethod
    def from_string(cls, s):
        """
        Parse a link from its string representation.

        :param s: A bare URL or a markdown-style link ``[label](url)``; surrounding \
        whitespace is ignored.
        :return: `Link` instance.
        """
        text = s.strip()
        if not text.startswith('['):
            # No markdown notation, so the whole string is taken to be the URL.
            return cls(text)
        assert text.endswith(')') and '](' in text
        # Without the enclosing "[" and ")" we are left with "label](url".
        parts = text[1:-1].split('](')
        # The URL is the first positional argument, hence the parts are passed reversed.
        return cls(*parts[::-1])

    @classmethod
    def from_(cls, obj):
        """
        Create a `Link` from any of the supported representations.

        :param obj: A `Link` (returned unchanged), a `str` (parsed via `from_string`), \
        a `dict` (passed as keyword arguments) or a `list` or `tuple` of length 2 (passed as \
        positional arguments).
        :raises TypeError: if `obj` is none of the above.
        """
        if isinstance(obj, cls):
            return obj
        if isinstance(obj, str):
            return cls.from_string(obj)
        if isinstance(obj, dict):
            return cls(**obj)
        if isinstance(obj, (list, tuple)) and len(obj) == 2:
            return cls(*obj)
        raise TypeError()

    def to_string(self):
        """Serialize the link; the inverse of `from_string`."""
        if not self.label:
            return self.url
        return '[{0}]({1})'.format(self.label, self.url)

    def __json__(self):
        """Return the link's attributes as `dict`."""
        return attr.asdict(self)
class Glottocodes(object):
    """
    Registry keeping track of glottocodes that have been dealt out.

    The registry is backed by a JSON file which maps the 4-character prefix of glottocodes
    to the highest number dealt out for this prefix.
    """
    def __init__(self, fname):
        """
        :param fname: Path of the JSON file storing the registry.
        """
        self._fname = fname
        self._store = jsonlib.load(fname)

    def __contains__(self, item):
        """
        Check whether a glottocode is covered by the registry.

        :raises ValueError: if `item` is not a well-formed glottocode.
        """
        prefix, number = Glottocode(item).split()
        if prefix not in self._store:
            return False
        return number <= self._store[prefix]

    def __iter__(self):
        """Yield the glottocodes for each prefix, numbered from 1234 up to the stored number."""
        for prefix, highest in self._store.items():
            yield from ('{0}{1}'.format(prefix, n) for n in range(1234, highest + 1))

    def new(self, name, dry_run=False):
        """
        Deal out a new glottocode for a languoid name.

        :param name: Name from which the 4-character prefix of the glottocode is derived.
        :param dry_run: If `True`, the new glottocode is not recorded in the registry.
        :return: The new `Glottocode`.
        """
        alpha = slug(str(name))[:4]
        assert alpha
        # Pad short prefixes to 4 characters by repeating the last character.
        alpha = alpha.ljust(4, alpha[-1])
        num = self._store.get(alpha, 1233) + 1
        if not dry_run:
            self._store[alpha] = num
            # Write the updated registry back to disk, with keys in sorted order.
            by_prefix = collections.OrderedDict(
                (key, self._store[key]) for key in sorted(self._store))
            jsonlib.dump(by_prefix, self._fname, indent=4)
        return Glottocode('%s%s' % (alpha, num))
class Glottocode(str):
    """
    A `str` which is guaranteed to be a well-formed glottocode, i.e. 4 lowercase
    alphanumeric characters followed by 4 digits.

    .. note:: `split` deliberately overrides `str.split` with a different signature.
    """
    #: Regular expression (as `str`) matching a glottocode.
    regex = r'[a-z0-9]{4}[0-9]{4}'
    #: Compiled pattern matching a glottocode at the end of the searched text.
    pattern = re.compile(regex + r'$')

    def __new__(cls, content):
        """
        :param content: The glottocode as `str`.
        :raises ValueError: if `content` is not a well-formed glottocode.
        """
        # `$` also matches right before a trailing newline, so `pattern.match` would let
        # values like 'abcd1234\n' through. `fullmatch` requires the pattern to consume
        # the complete string.
        if not cls.pattern.fullmatch(content):
            raise ValueError(content)
        return str.__new__(cls, content)

    def split(self):
        """
        Split the glottocode into its two components.

        :return: Pair (4-character prefix, number as `int`).
        """
        return self[:4], int(self[4:])
@attr.s
class Reference(object):
    """
    A reference of a bibliographical record in Glottolog.

    References are serialized as ``**<provider>:<bibkey>**``, optionally followed by
    ``:<pages>`` and ``<trigger "...">``.
    """
    key = attr.ib()  #: Key of the form ``<provider>:<bibkey>``
    pages = attr.ib(default=None)  #:
    trigger = attr.ib(default=None)
    # The end tag as matched by `pattern`; `__str__` always writes "**".
    endtag = attr.ib(default='**')
    pattern = re.compile(
        r"\*\*(?P<key>[a-z0-9\-_]+:[a-zA-Z.?\-;*'/()\[\]!_:0-9\u2014]+?)(?P<endtag>\*\*|\(\*\*\))"
        r"(:(?P<pages>[0-9\-f]+))?"
        r'(<trigger "(?P<trigger>[^\"]+)">)?')
    old_pattern = re.compile(r'[^\[]+\[(?P<pages>[^\]]*)\]\s*\([0-9]+\s+(?P<key>[^\)]+)\)')

    def __str__(self):
        res = '**{0.key}**'.format(self)
        if self.pages:
            res += ':{0.pages}'.format(self)
        if self.trigger:
            res += '<trigger "{0.trigger}">'.format(self)
        return res

    def get_source(self, api) -> Entry:
        """
        Retrieve the referenced bibliographical record.

        :param api: Object providing access to the bibfiles via `api.bibfiles`, indexed \
        by bibfile name.
        """
        return api.bibfiles[self.bibname][self.bibkey]

    @property
    def provider(self):
        """The part of the key before the first colon."""
        return self.key.split(':')[0]

    @property
    def bibname(self):
        """Name of the bibfile, i.e. the provider with suffix ``.bib``."""
        return '{0}.bib'.format(self.provider)

    @property
    def bibkey(self):
        """The part of the key after the first colon."""
        return self.key.split(':', 1)[1]

    @classmethod
    def from_match(cls, match):
        """
        Create a `Reference` from a regex match.

        :param match: Match object whose named groups correspond to the attributes of \
        `Reference`.
        """
        # Internal sanity check only, kept for backwards compatibility. Input validation
        # is done explicitly in `from_string`, because asserts are stripped under `-O`.
        assert match
        return cls(**match.groupdict())

    @classmethod
    def from_string(cls, string, pattern=None):
        """
        Parse a `Reference` from a string.

        :param string: Serialized reference; surrounding whitespace is ignored.
        :param pattern: Compiled regex to use instead of `Reference.pattern`.
        :raises ValueError: if `string` does not match the pattern.
        """
        match = (pattern or cls.pattern).match(string.strip())
        # Check explicitly rather than catching the AssertionError from `from_match`:
        # with assertions disabled a failed match would surface as AttributeError.
        if match is None:
            raise ValueError('Invalid reference: {0}'.format(string))
        return cls.from_match(match)

    @classmethod
    def from_list(cls, list_, pattern=None):
        """
        Parse a list of serialized references, skipping blank items.

        :param list_: Iterable of strings.
        :param pattern: Compiled regex to use instead of `Reference.pattern`.
        :raises ValueError: if a non-blank item is not a valid reference.
        """
        return [cls.from_string(s, pattern=pattern) for s in list_ if s.strip()]
@attr.s
class Country(object):
    """
    Glottolog languoids can be related to the countries they are spoken in. These
    countries are identified by ISO 3166 Alpha-2 codes.

    .. seealso:: https://en.wikipedia.org/wiki/ISO_3166-1
    """
    id = attr.ib()  #: ISO 3166 alpha 2 code
    name = attr.ib()  #: name

    def __str__(self):
        return self._format()

    def _format(self, minimal=True):
        """
        Format the country as string.

        :param minimal: If `True` only the code is returned, otherwise ``name (code)``.
        """
        if minimal:
            return '{0.id}'.format(self)
        return '{0.name} ({0.id})'.format(self)

    @classmethod
    def from_name(cls, name):
        """
        Look up a country by name.

        :return: `Country` instance or `None` if the lookup yields nothing.
        """
        record = pycountry.countries.get(name=name)
        if not record:
            return None
        return cls(id=record.alpha_2, name=record.name)

    @classmethod
    def from_id(cls, id_):
        """
        Look up a country by ISO 3166 alpha 2 code.

        :return: `Country` instance or `None` if the lookup yields nothing.
        """
        record = pycountry.countries.get(alpha_2=id_)
        if not record:
            return None
        return cls(id=record.alpha_2, name=record.name)

    @classmethod
    def from_text(cls, text):
        """
        Create a `Country` from text containing a country code or consisting of a name.

        If two consecutive uppercase letters - optionally in parentheses - are found in
        `text`, they are looked up as country code; otherwise `text` is looked up as name.
        """
        match = re.search(r'(?P<code_only>^[A-Z]{2}$)|\(?(?P<code>[A-Z]{2})\)?', text)
        if not match:
            return cls.from_name(text)
        return cls.from_id(match.group('code_only') or match.group('code'))
@attr.s
class ISORetirement(object):
    """
    Details about retired ISO codes associated with the languoid, as extracted from
    accepted ISO 639-3 change requests.
    """
    code = attr.ib(default=None)  #: Retired ISO 639-3 code
    name = attr.ib(default=None)  #: Name of the retired ISO language
    change_request = attr.ib(default=None)  #: Number of the ISO change request
    effective = attr.ib(default=None)  #: Date of acceptance of the change request
    reason = attr.ib(default=None)  #: Reason to retire the ISO code
    change_to = attr.ib(default=attr.Factory(list))  #: List of ISO codes replacing the retired code
    remedy = attr.ib(default=None)  #: What to do about the retired code
    #:
    comment = attr.ib(
        default=None,
        # Remove a dot directly following a newline; falsy values are passed through as is.
        converter=lambda text: text and text.replace('\n.', '\n'),
    )

    def asdict(self):
        """Return the attributes as `dict`."""
        return attr.asdict(self)

    # Serialization to JSON uses the plain `dict` representation.
    __json__ = asdict
@attr.s
class Endangerment(object):
    """
    Endangerment status of the languoid, together with the source of the assessment.
    """
    status: AES = attr.ib(validator=attr.validators.instance_of(AES))  #:
    source: AESSource = attr.ib(validator=attr.validators.instance_of(AESSource))  #:
    comment = attr.ib()  #:
    #: Date when the endangerment status was assessed
    date = attr.ib(converter=parser.parse)

    def __json__(self):
        """
        Return the attributes as `dict`, with `date` reduced to the part of its
        ISO format representation preceding the "T".
        """
        data = attr.asdict(self, recurse=True)
        iso_date, _, _ = data['date'].isoformat().partition('T')
        data['date'] = iso_date
        return data
def valid_ethnologue_versions(inst, attr, value):
    """
    Validate a list of Ethnologue version specifiers.

    Each item must be either ``E`` followed by a two-digit number not starting with ``0``
    (e.g. ``E16``) or the literal ``ISO 639-3``.

    The signature follows the attrs validator convention; `inst` and `attr` are unused.

    :param value: Iterable of `str`.
    :raises ValueError: if any item is not a valid version specifier.
    """
    # `fullmatch` instead of `match` with a trailing `$`: `$` also matches right before
    # a trailing newline, which would let items like 'E16\n' pass.
    is_valid = re.compile(r'E[1-9][0-9]|ISO 639-3').fullmatch
    if not all(is_valid(x) for x in value):  # pragma: no cover
        raise ValueError('invalid ethnologue_versions: {0}'.format('/'.join(value)))
def valid_comment_type(inst, attr, value):
    """
    Validate a comment type: only ``spurious`` and ``missing`` are accepted.

    The signature follows the attrs validator convention; `inst` and `attr` are unused.

    :raises ValueError: if `value` is not one of the accepted types.
    """
    if value in ('spurious', 'missing'):
        return
    raise ValueError('invalid comment type: {0}'.format(value))
def valid_comment(inst, attr, value):
    """
    Validate a comment: it must be a non-empty `str`.

    The signature follows the attrs validator convention; `inst` and `attr` are unused.

    :raises ValueError: if `value` is empty or not a `str`.
    """
    if value and isinstance(value, str):
        return
    raise ValueError(value)