"""
OO wrappers for various Glottolog data objects.
"""
import datetime
import logging
import re
from typing import Optional, Literal, get_args, TYPE_CHECKING, Union
import collections
from collections.abc import Iterable
import dataclasses
import urllib.parse
import markdown
import pycountry
from clldutils.misc import slug, nfilter
from clldutils import jsonlib
from clldutils.markup import MarkdownLink
from dateutil import parser
from ..util import message
from ..config import AESSource, AES
from ..references import Entry
if TYPE_CHECKING: # pragma: no cover
from . import Languoid
# Public names of this module, i.e. what a star-import of the module exposes.
__all__ = [
    'Glottocode', 'Glottocodes',
    'Reference',
    'Country',
    'ClassificationComment',
    'ISORetirement',
    'Endangerment',
    'EthnologueComment',
    'Link',
]
@dataclasses.dataclass(eq=True, frozen=True)
class Link:
    """
    A URL with an optional label. Links appear in multiple places in languoid ini files.

    Instances are immutable (the dataclass is frozen) and compare by value.
    """
    url: str
    label: Optional[str] = None  # Link text; `None` for a bare URL.

    @property
    def domain(self) -> str:
        """The domain part of the URL, aka netloc."""
        return urllib.parse.urlparse(self.url).netloc

    @classmethod
    def from_string(cls, s: str) -> 'Link':
        """
        Simplistic parsing of links from markdown formatting.

        :param s: Either a bare URL or a markdown link of the form `[label](url)`. \
        Surrounding whitespace is ignored.
        :return: The parsed `Link`.
        :raises AssertionError: If `s` starts with `[` but is not of the form `[label](url)`.
        """
        s = s.strip()
        if not s.startswith('['):
            return cls(s)
        # Raised explicitly (rather than via an `assert` statement) so that the check is not
        # stripped when Python runs with -O; the exception type is kept for existing callers.
        if not (s.endswith(')') and '](' in s):
            raise AssertionError(f'Invalid markdown link: {s}')
        # Split on the first "](" only, so a URL containing that sequence cannot yield more
        # than two parts.
        label, _, url = s[1:-1].partition('](')
        return cls(url, label)

    @classmethod
    def from_(cls, obj: Union['Link', str, list, tuple, dict]) -> 'Link':
        """
        Instantiate link from various input types.

        :param obj: A `Link` (returned as is), a string (parsed with `from_string`), \
        a `(url, label)` pair given as list or tuple, or a dict of keyword arguments.
        :raises TypeError: If `obj` is of none of the supported types/shapes.
        """
        if isinstance(obj, cls):
            return obj
        if isinstance(obj, str):
            return cls.from_string(obj)
        if isinstance(obj, (list, tuple)) and len(obj) == 2:
            return cls(*obj)
        if isinstance(obj, dict):
            return cls(**obj)
        raise TypeError(f'Cannot create Link from {type(obj).__name__}: {obj!r}')

    def to_string(self) -> str:
        """Link formatted as markdown, or the bare URL if there is no label."""
        if self.label:
            return f'[{self.label}]({self.url})'
        return self.url

    def __json__(self) -> dict:
        """The link as dict with keys `url` and `label`."""
        return dataclasses.asdict(self)
class Glottocodes:
    """
    Registry keeping track of glottocodes that have been dealt out.

    The registry is loaded with `jsonlib.load` from `fname` and maps each 4-character
    prefix to the highest number that has been assigned for this prefix.
    """
    #: Numeric part of the first Glottocode minted for any prefix.
    _first_number = 1234

    def __init__(self, fname):
        self._fname = fname
        self._store = jsonlib.load(self._fname)

    def __contains__(self, item) -> bool:
        """
        Whether `item` has a registered prefix and a number not above the highest one stored.

        :raises ValueError: If `item` is not a syntactically valid Glottocode.
        """
        alpha, num = Glottocode(item).split()
        return alpha in self._store and num <= self._store[alpha]

    def __iter__(self):
        """Yield all glottocodes from the first number up to the stored one, per prefix."""
        for alpha, num in self._store.items():
            for n in range(self._first_number, num + 1):
                yield f'{alpha}{n}'

    def new(self, name, dry_run=False) -> 'Glottocode':
        """
        Mint a new Glottocode

        :param name: Name from which the 4-character prefix is derived via `slug`.
        :param dry_run: If `True`, the registry (in memory and on disk) is left untouched.
        :return: The next unused Glottocode for the prefix.
        :raises AssertionError: If `name` does not yield any usable character.
        """
        alpha = slug(str(name))[:4]
        # Raised explicitly so the check also holds when Python runs with -O (where an
        # `assert` statement would be skipped and the padding below would fail instead).
        if not alpha:
            raise AssertionError(f'Cannot derive a Glottocode prefix from name: {name!r}')
        # Short prefixes are padded by repeating their last character.
        alpha = alpha.ljust(4, alpha[-1])
        num = self._store.get(alpha, self._first_number - 1) + 1
        if not dry_run:
            self._store[alpha] = num
            # Store the updated dictionary of glottocodes back, sorted by prefix.
            ordered = {k: self._store[k] for k in sorted(self._store)}
            jsonlib.dump(ordered, self._fname, indent=4)
        return Glottocode(f'{alpha}{num}')
class Glottocode(str):
    """
    Glottocodes are special strings: four characters from `[a-z0-9]` followed by four digits.

    :raises ValueError: On instantiation, if the content does not have this form.
    """
    regex = r'[a-z0-9]{4}[0-9]{4}'
    pattern = re.compile(regex + r'$')

    def __new__(cls, content):
        # `fullmatch` instead of `match`: `$` also matches just before a trailing newline,
        # so `match` would let a string like 'abcd1234\n' pass validation.
        if not cls.pattern.fullmatch(content):
            raise ValueError(content)
        return str.__new__(cls, content)

    def split(self) -> tuple[str, int]:
        """
        Split the Glottocode into alpha-numeric and numeric part.

        Note that this overrides `str.split` with a different signature.
        """
        return self[:4], int(self[4:])
@dataclasses.dataclass
class Reference:
    """
    A reference of a bibliographical record in Glottolog.

    In its string form a reference looks like `**<provider>:<bibkey>**`, optionally followed
    by `:<pages>` and `<trigger "...">`.
    """
    key: str  # "<provider>:<bibkey>"
    pages: Optional[str] = None
    trigger: Optional[str] = None
    endtag: str = '**'  # Closing tag as matched by `pattern`: '**' or '(**)'.
    # The two compiled regexes below are effectively constants. They are kept as dataclass
    # fields so that the constructor signature stays as it was, but they are excluded from
    # `repr` and `==`, where they only add noise.
    pattern: re.Pattern = dataclasses.field(
        default=re.compile(
            r"\*\*(?P<key>[a-z0-9\-_]+:[a-zA-Z.?\-;*'/()\[\]!_:0-9\u2014]+?)(?P<endtag>\*\*|\(\*\*\))"
            r"(:(?P<pages>[0-9\-f]+))?"
            r'(<trigger "(?P<trigger>[^\"]+)">)?'),
        repr=False,
        compare=False,
    )
    old_pattern: re.Pattern = dataclasses.field(
        default=re.compile(r'[^\[]+\[(?P<pages>[^]]*)]\s*\([0-9]+\s+(?P<key>[^)]+)\)'),
        repr=False,
        compare=False,
    )

    def __str__(self):
        # Note: the closing tag is always written as '**', regardless of `endtag`.
        res = f'**{self.key}**'
        if self.pages:
            res += f':{self.pages}'
        if self.trigger:
            res += f'<trigger "{self.trigger}">'
        return res

    def get_source(self, api) -> 'Entry':
        """
        Retrieve the referenced bibliographical record.

        :param api: Object providing `bibfiles`, indexable by bib file name and then by \
        local bibkey.
        """
        return api.bibfiles[self.bibname][self.bibkey]

    @property
    def provider(self) -> str:
        """The provider id."""
        return self.key.split(':')[0]

    @property
    def bibname(self) -> str:
        """The name of the bibtex file."""
        return f'{self.provider}.bib'

    @property
    def bibkey(self) -> str:
        """The local bibtex key in the bib."""
        return self.key.split(':', 1)[1]

    @classmethod
    def from_match(cls, match: re.Match) -> 'Reference':
        """
        Instantiate a reference from a regex match.

        :param match: Match object whose named groups are valid constructor arguments.
        :raises AssertionError: If `match` is `None` (or otherwise falsy).
        """
        # Raised explicitly rather than via an `assert` statement, so the check (and the
        # conversion to `ValueError` in `from_string`) also works when Python runs with -O.
        if not match:
            raise AssertionError('No regex match to create a Reference from')
        return cls(**match.groupdict())

    @classmethod
    def from_string(cls, string: str, pattern: Optional[re.Pattern] = None) -> 'Reference':
        """
        Parse a reference from a string.

        :param string: The text to parse; surrounding whitespace is ignored.
        :param pattern: Regex to match at the start of the text; defaults to `cls.pattern`.
        :raises ValueError: If the text does not match the pattern.
        """
        try:
            return cls.from_match((pattern or cls.pattern).match(string.strip()))
        except AssertionError as e:
            raise ValueError(f'Invalid reference: {string}') from e

    @classmethod
    def from_list(
            cls,
            list_: Iterable[Union['Reference', str]],
            pattern: Optional[re.Pattern] = None,
    ) -> list['Reference']:
        """
        Turn list of strings into list of Reference instances.

        `Reference` instances are passed through, strings which are empty or only whitespace
        are skipped.

        :raises ValueError: If a non-blank string cannot be parsed (see `from_string`).
        """
        res = []
        for s in list_:
            if isinstance(s, cls):
                res.append(s)
            elif s.strip():
                # `from_string` already reports unparseable input as `ValueError`.
                res.append(cls.from_string(s, pattern=pattern))
        return res
@dataclasses.dataclass
class Country:
    """
    A country in which a Glottolog languoid is spoken, identified by its
    ISO 3166 Alpha-2 code.

    .. seealso:: https://en.wikipedia.org/wiki/ISO_3166-1
    """
    id: str  #: ISO 3166 alpha 2 code
    name: str

    def __str__(self):
        return self._format()

    def _format(self, minimal=True):
        """Return just the code if `minimal`, otherwise "name (code)"."""
        if minimal:
            return f'{self.id}'
        return f'{self.name} ({self.id})'

    @classmethod
    def from_name(cls, name) -> Optional['Country']:  # pylint: disable=C0116
        record = pycountry.countries.get(name=name)
        if not record:
            return None  # pragma: no cover
        return cls(id=record.alpha_2, name=record.name)

    @classmethod
    def from_id(cls, id_) -> Optional['Country']:  # pylint: disable=C0116
        record = pycountry.countries.get(alpha_2=id_)
        if not record:
            return None  # pragma: no cover
        return cls(id=record.alpha_2, name=record.name)

    @classmethod
    def from_text(cls, text: str) -> Optional['Country']:
        """
        Look up a country from text containing an alpha_2 code or a country name.

        If two consecutive uppercase ASCII letters are found anywhere in `text`, they are
        looked up as code; only otherwise the whole text is looked up as name.
        """
        found = re.search(r'(?P<code_only>^[A-Z]{2}$)|\(?(?P<code>[A-Z]{2})\)?', text)
        if found is None:
            return cls.from_name(text)
        code = found.group('code_only') or found.group('code')
        return cls.from_id(code)
@dataclasses.dataclass
class ISORetirement:  # pylint: disable=R0902
    """
    Data about a retired ISO code associated with the languoid, as extracted from
    accepted ISO 639-3 change requests.
    """
    #: Retired ISO 639-3 code
    code: Optional[str] = None
    #: Name of the retired ISO language
    name: Optional[str] = None
    #: Number of the ISO change request
    change_request: Optional[str] = None
    #: Date of acceptance of the change request
    effective: Optional[str] = None
    #: Reason to retire the ISO code
    reason: Optional[str] = None
    #: List of ISO codes replacing the retired code
    change_to: list[str] = dataclasses.field(default_factory=list)
    #: What to do about the retired code
    remedy: Optional[str] = None
    comment: Optional[str] = None

    def __post_init__(self):
        # Drop the "." following a newline in the comment; an empty comment becomes `None`.
        if self.comment:
            self.comment = self.comment.replace('\n.', '\n')
        else:
            self.comment = None

    def asdict(self):  # pylint: disable=C0116
        return dataclasses.asdict(self)

    __json__ = asdict
@dataclasses.dataclass
class Endangerment:
    """
    Endangerment status of the languoid, together with its source, a comment and
    the assessment date.
    """
    status: AES
    source: AESSource
    comment: str
    #: Date when the endangerment status was assessed
    date: datetime.datetime

    def __post_init__(self):
        for value, expected_type in ((self.status, AES), (self.source, AESSource)):
            assert isinstance(value, expected_type)
        # Dates given as text are parsed with dateutil.
        if isinstance(self.date, str):
            self.date = parser.parse(self.date)

    def __json__(self):
        """The dataclass as dict, with `date` reduced to the part before the ISO "T"."""
        data = dataclasses.asdict(self)
        day, _, _ = data['date'].isoformat().partition('T')
        data['date'] = day
        return data

    def check(self, lang: 'Languoid', keys: list[str], log: logging.Logger):
        """
        Check formatting of endangerment info.

        Logs an error for the reference id of the source and for each markdown link URL
        in the comment, if it is not contained in `keys`.
        """
        def report_unknown_bibkey(ml):
            if ml.url not in keys:
                log.error(message(lang, f'endangerment comment: invalid bibkey: {ml.url}'))

        source = self.source
        if source and source.reference_id:
            ref = source.reference_id
            if ref not in keys:  # pragma: no cover
                log.error(message(lang, f'endangerment: invalid bibkey {ref}'))

        if self.comment:
            # Only used to visit each link; the return value is discarded.
            MarkdownLink.replace(self.comment, report_unknown_bibkey)
# Type alias restricting a value to the two literal strings 'spurious' and 'missing'.
# NOTE(review): presumably consumed by the comment classes defined further down - confirm.
CommentType = Literal['spurious', 'missing']