Source code for pyglottolog.references.bibfiles

# bibfiles.py - ordered collection of bibfiles with load/save api

import re
import math
import typing
import pathlib
import datetime
import functools
import collections
import unicodedata

import attr

from clldutils.path import memorymapped
from clldutils.source import Source
from clldutils.text import split_text
from clldutils.inifile import INI
from clldutils.attrlib import cmp_off

from . import bibtex
from . import util
from ..config import MEDType
from .bibfiles_db import Database

__all__ = ['BibFiles', 'BibFile', 'Entry']

# Default ``filepath`` argument of `BibFiles.to_sqlite`.
BIBFILES = 'bibfiles.sqlite3'

# Fallback doctype mapping (name -> name), used by `Entry` when no API object
# is attached to it. The insertion order is significant: `Entry.weight` ranks
# an entry by the position of its first doctype within this mapping.
DOCTYPES = {
    doctype: doctype
    for doctype in (
        'grammar',
        'grammar_sketch',
        'dictionary',
        'specific_feature',
        'phonology',
        'text',
        'new_testament',
        'wordlist',
        'comparative',
        'minimal',
        'socling',
        'dialectology',
        'overview',
        'ethnographic',
        'bibliographical',
        'unknown',
    )
}

# A 4-digit year starting with 1 or 2, enclosed in square brackets and
# optionally followed by "-<digits>", e.g. "[2001]" or "[2001-2003]".
PREF_YEAR_PATTERN = re.compile(r'\[(?P<year>(1|2)[0-9]{3})(\-[0-9]+)?\]')

# Any 4-digit number starting with 1 or 2.
YEAR_PATTERN = re.compile(r'(?P<year>(1|2)[0-9]{3})')


class BibFiles(list):
    """Ordered collection of `BibFile` objects accessible by filename or index."""

    @classmethod
    def from_path(cls, path: typing.Union[str, pathlib.Path], api=None) -> 'BibFiles':
        """Collect the BibTeX files `<path>/bibtex/*.bib` which are listed in `<path>/BIBFILES.ini`.

        :param path: Directory containing `BIBFILES.ini` and the `bibtex` sub-directory.
        :param api: Optional object handed on to each `BibFile` as its `api`.
        """
        base_dir = pathlib.Path(path)
        config = INI.from_file(base_dir / 'BIBFILES.ini', interpolation=None)
        bibtex_dir = base_dir / 'bibtex'
        return cls(cls._iterbibfiles(config, bibtex_dir, api=api))

    @staticmethod
    def _iterbibfiles(ini, bibtex_path, api=None):
        """Yield a `BibFile` for each section of `ini` whose name ends in ".bib".

        The options of the section are passed as keyword arguments to `BibFile`.

        :raises ValueError: if the file named by a section does not exist in `bibtex_path`.
        """
        for section in ini.sections():
            if not section.endswith('.bib'):
                continue
            bibfile_path = bibtex_path / section
            if not bibfile_path.exists():  # pragma: no cover
                raise ValueError('invalid bibtex file referenced in BIBFILES.ini')
            yield BibFile(fname=bibfile_path, api=api, **ini[section])

    def __init__(self, bibfiles):
        super().__init__(bibfiles)
        # Lookup table from file name (e.g. "hh.bib") to `BibFile`.
        self._map = {}
        for bibfile in self:
            self._map[bibfile.fname.name] = bibfile

    def __getitem__(self, index_or_filename: typing.Union[int, str])\
            -> typing.Union['BibFile', 'Entry']:
        """Retrieve a bibfile by index or filename or an entry by qualified key.

        :param index_or_filename: Either an `int` index, or a bibfile name (with or without \
        the ".bib" suffix), or a provider-qualified BibTeX key in the form `<prov>:<key>`.
        :return: A `BibFile` instance, or an `Entry` instance.
        """
        if not isinstance(index_or_filename, str):
            # Plain list access.
            return super().__getitem__(index_or_filename)

        # Split at the first colon only - the BibTeX key itself may contain colons.
        stem, colon, key = index_or_filename.partition(':')
        if colon:
            return self._map['{}.bib'.format(stem)][key]

        if index_or_filename.endswith('.bib'):
            filename = index_or_filename
        else:
            filename = index_or_filename + '.bib'
        return self._map[filename]

    def to_sqlite(self, filepath=BIBFILES, rebuild=False, verbose=False):
        """Return a database with the bibfiles loaded."""
        return Database.from_bibfiles(self, filepath, rebuild=rebuild, verbose=verbose)

    def roundtrip_all(self):
        """Load and save all bibfiles with the current settings."""
        results = []
        for bibfile in self:
            results.append(bibfile.roundtrip())
        return results
def file_if_exists(i, a, value):
    """Validator accepting a path which is either missing or a regular file.

    `i` and `a` are not used; the function is passed as ``validator`` to an attribute
    definition and only inspects `value`.

    :raises ValueError: if `value` exists but is not a file.
    """
    if not value.exists() or value.is_file():
        return
    raise ValueError('invalid path')  # pragma: no cover
@attr.s
class BibFile(object):
    """
    Represents a BibTeX file, storing a provider's bibliography, providing easy access to its
    records.
    """
    fname: pathlib.Path = attr.ib(validator=file_if_exists)
    name = attr.ib(default=None)  #: Short name of the bibliography
    title = attr.ib(default=None)  #: Title of the bibliography
    description = attr.ib(default=None)  #: The provenance of the bibliography
    abbr = attr.ib(default=None)
    encoding = attr.ib(default='utf-8')  # used for reading and writing the file
    normalize = attr.ib(default='NFC')  # passed on to `bibtex.save`
    sortkey = attr.ib(
        default=None,
        # The string "none" (in any case) is treated like a missing value.
        converter=lambda s: None if s is None or s.lower() == 'none' else s)
    priority = attr.ib(default=0, converter=int)
    url = attr.ib(default=None)  #: URL pointing to the source of the bibliography
    curation = attr.ib(default=None)  #: Curation policy for the bibliography at Glottolog
    api = attr.ib(default=None)  # handed on to each `Entry` created from this file

    @property
    def id(self):
        """The file name without suffix, used as prefix of qualified entry IDs."""
        return self.fname.stem

    def __getitem__(self, item: str) -> 'Entry':
        """
        :param item: BibTeX citation key of an entry (optionally prefixed with `<id>:`)
        :raises KeyError: if no matching `Entry` is contained in the `BibFile`
        """
        if item.startswith(self.id + ':'):
            item = item.split(':', 1)[1]
        text = None
        with memorymapped(self.fname) as string:
            # Locate the entry header "@<type>{<key>" followed by whitespace or a comma.
            m = re.search(
                b'@[A-Za-z]+{' + re.escape(item.encode(self.encoding)) + rb'[\s,]', string)
            if m:
                # The entry extends up to the next line starting with "@", or to the end of
                # the file.
                end = string.find(b'\n@', m.end())
                if end >= 0:
                    text = string[m.start():end]
                else:
                    text = string[m.start():]
        if text:
            # Only the first entry parsed from the snippet is of interest.
            for k, (t, f) in bibtex.iterentries_from_text(text, encoding=self.encoding):
                return Entry(k, t, f, self, self.api)
        raise KeyError(item)

    def visit(self, visitor=None):
        """Rewrite the file, dropping each entry for which `visitor(entry)` returns `True`.

        :param visitor: Optional callable accepting an `Entry`; it may modify the entry. \
        Without a visitor all entries are kept.
        """
        entries = collections.OrderedDict()
        for entry in self.iterentries():
            if visitor is None or visitor(entry) is not True:
                entries[entry.key] = (entry.type, entry.fields)
        self.save(entries)

    @property
    def size(self):
        """Size of the file in bytes."""
        return self.fname.stat().st_size

    @property
    def mtime(self):
        """Modification time of the file as naive `datetime.datetime`."""
        return datetime.datetime.fromtimestamp(self.fname.stat().st_mtime)

    def iterentries(self):
        """Yield an `Entry` for each record parsed from the file."""
        for k, (t, f) in bibtex.iterentries(filename=self.fname, encoding=self.encoding):
            yield Entry(k, t, f, self, self.api)

    def keys(self):
        """Return the list of qualified IDs (`<id>:<key>`) of all entries."""
        return ['{0}:{1}'.format(self.id, e.key) for e in self.iterentries()]

    @property
    def glottolog_ref_id_map(self) -> typing.Dict[str, str]:
        """Map BibTeX keys to `glottolog_ref_id`, for entries which have this field."""
        return {
            e.key: e.fields['glottolog_ref_id']
            for e in self.iterentries() if 'glottolog_ref_id' in e.fields}

    def update(self, fname, log=None, keep_old=False):
        """Replace the content of the file with the entries read from `fname`.

        `glottolog_ref_id` fields of the current content are carried over to entries with
        the same key which lack this field.

        :param fname: Path of the BibTeX file providing the new entries.
        :param log: Optional logger; the number of new entries is reported via `log.info`.
        :param keep_old: If `True`, the current entries are kept (entries from `fname` \
        with the same key replace them).
        """
        entries, new = collections.OrderedDict(), 0
        if keep_old:
            for k, (t, f) in bibtex.iterentries(filename=self.fname, encoding=self.encoding):
                entries[k] = (t, f)
        ref_id_map = self.glottolog_ref_id_map
        for key, (type_, fields) in bibtex.iterentries(fname, self.encoding):
            # NOTE(review): an entry with a known key which already brings its own
            # glottolog_ref_id is counted as "new" here, too - confirm this is intended.
            if key in ref_id_map and 'glottolog_ref_id' not in fields:
                fields['glottolog_ref_id'] = ref_id_map[key]
            else:
                new += 1
            entries[key] = (type_, fields)
        self.save(entries)
        if log:  # pragma: no cover
            log.info('{0} new entries'.format(new))

    def load(self, preserve_order=None):
        """Return entries as bibkey -> (entrytype, fields) dict.

        :param preserve_order: Passed on to `bibtex.load`; defaults to `True` if no \
        `sortkey` is configured for the bibfile.
        """
        if preserve_order is None:
            preserve_order = self.sortkey is None
        return bibtex.load(self.fname, preserve_order, encoding=self.encoding)

    def save(self, entries):
        """Write bibkey -> (entrytype, fields) map to file."""
        bibtex.save(
            entries,
            filename=self.fname,
            sortkey=self.sortkey,
            encoding=self.encoding,
            normalize=self.normalize)

    def __str__(self):
        return f'<{self.__class__.__name__} {self.fname.name}>'

    def check(self, log):
        """Check the file and log a one-line summary.

        :param log: Logger; the summary is logged as warning if invalid items were found, \
        as info otherwise.
        :return: Pair (number of entries, verdict string).
        """
        entries = self.load()  # bare BibTeX syntax
        invalid = bibtex.check(filename=self.fname)  # names/macros etc.
        verdict = ('(%d invalid)' % invalid) if invalid else 'OK'
        # `Logger.warn` is a deprecated alias, so `warning` is used instead.
        method = log.warning if invalid else log.info
        method('%s %d %s' % (self, len(entries), verdict))
        return len(entries), verdict

    def roundtrip(self):
        """Print the bibfile's description, then load the file and save it again."""
        print(self)
        self.save(self.load())

    def show_characters(self, include_plain=False):
        """Display character-frequencies (excluding printable ASCII).

        :param include_plain: If `True`, printable ASCII characters are listed as well.
        """
        with self.fname.open(encoding=self.encoding) as fd:
            text = fd.read()
        hist = collections.Counter(text)
        table = '\n'.join(
            '%d\t%-9r\t%s\t%s' % (n, c, c, unicodedata.name(c, ''))
            for c, n in hist.most_common()
            # Printable ASCII ranges from 0x20 (space) to 0x7e (tilde).
            if include_plain or not 0x20 <= ord(c) <= 126)
        print(table)
@functools.total_ordering
@attr.s(**cmp_off)
class Entry(object):
    """
    Represents an entry in a `BibFile`, i.e. a bibliographical record.

    .. note::

        `Entry` instances are orderable. The ordering is the one used to compute MEDs, i.e.

        - grammars are "better" than other document types,
        - more pages is "better" than less,
        - more recent is "better" than old.

    .. code-block:: python

        >>> g = pyglottolog.Glottolog()
        >>> g.bibfiles['hh:g:MacDonell:Sanskrit'] > g.bibfiles['hh:hv:Weijnen:Nederlandse']
        True
        >>> refs = g.refs_by_languoid(g.bibfiles['hh'])
        >>> sorted(refs[0]['stan1295'])[-1].med_type.name
        'long grammar'
    """
    key = attr.ib()  #: BibTeX citation key
    type = attr.ib()  #: BibTeX entry type
    fields: dict = attr.ib()  #: The metadata of the record
    bib = attr.ib()  # the `BibFile` the entry belongs to
    api = attr.ib(default=None)

    # FIXME: add method to apply triggers!

    lgcode_regex = r'[a-z0-9]{4}[0-9]{4}|[a-z]{3}|NOCODE_[A-Z][^\s\]]+'
    lgcode_in_brackets_pattern = re.compile(r"\[(" + lgcode_regex + r")]")
    recomma = re.compile(r"[,/]\s?")
    # The alternation must be grouped, otherwise "$" only anchors its last branch and any
    # string merely *starting* with three lowercase letters would be accepted as code.
    lgcode_pattern = re.compile(r"(?:" + lgcode_regex + r")$")

    # Comparison is based on `weight` only.
    def __eq__(self, other):
        return self.weight == other.weight

    def __ne__(self, other):
        return not (self == other)

    def __lt__(self, other):
        return self.weight < other.weight

    @property
    def _defined_doctypes(self):
        """Ordered mapping of doctypes: taken from the API if available, else `DOCTYPES`."""
        return collections.OrderedDict((hht.id, hht.id) for hht in self.api.hhtypes) \
            if self.api else DOCTYPES

    @functools.cached_property
    def weight(self):
        """Sort key of the entry: (negated doctype rank, pages, year, qualified ID)."""
        doctypes = self._defined_doctypes
        assigned = self.doctypes(doctypes)[0]

        # Rank by the first (i.e. best) doctype assigned to the entry; entries without any
        # known doctype rank behind all defined doctypes.
        index = len(doctypes)
        doctype = None
        if assigned:
            index = list(doctypes.values()).index(assigned[0])
            doctype = getattr(assigned[0], 'id', assigned[0])

        # the number of pages is divided by number of doctypes times number of described languages
        pages = int(math.ceil(
            float(self.pages_int or 0) /  # noqa: W504
            ((len(assigned) or 1) *  # noqa: W504
             (len(self.lgcodes(self.fields.get('lgcode', ''))) or 1))))

        # Long grammars rank before everything else.
        if doctype == 'grammar' and pages >= 300:
            index = -1

        return -index, pages, self.year_int or 0, self.id

    @functools.cached_property
    def med_type(self) -> MEDType:
        """
        The entry's type on the MED scale.

        `None` if the entry has no `api`.
        """
        if self.api:
            doctypes = list(self._defined_doctypes.keys())
            index = -self.weight[0]
            if index == -1:
                return self.api.med_types.long_grammar
            if 'dictionary' in doctypes and index < doctypes.index('dictionary'):
                return self.api.med_types.get(doctypes[index])
            if 'wordlist' in doctypes and index < doctypes.index('wordlist'):
                return self.api.med_types.phonology_or_text
            return self.api.med_types.wordlist_or_less

    @functools.cached_property
    def year_int(self):
        """The year parsed from the "year" field as `int`, or `None`."""
        if self.fields.get('year'):
            # prefer years in brackets over the first 4-digit number.
            match = PREF_YEAR_PATTERN.search(self.fields.get('year'))
            if match:
                return int(match.group('year'))
            match = YEAR_PATTERN.search(self.fields.get('year'))
            if match:
                return int(match.group('year'))

    @functools.cached_property
    def pages_int(self):
        """Number of pages, from the "numberofpages" field or computed from "pages"."""
        if self.fields.get('numberofpages'):
            try:
                pages = int(self.fields.get('numberofpages').strip())
                if pages < util.MAX_PAGE:
                    return pages
            except ValueError:
                # Not a plain number - fall through to the "pages" field.
                pass
        if self.fields.get('pages'):
            return util.compute_pages(self.fields['pages'])[2]

    @functools.cached_property
    def publisher_and_address(self):
        """Pair (publisher, address), splitting a publisher given as "address: publisher"."""
        p = self.fields.get('publisher')
        if p and ':' in p:
            address, publisher = [s.strip() for s in p.split(':', 1)]
            # Only split if this does not contradict an explicit "address" field.
            if (not self.fields.get('address')) or self.fields['address'] == address:
                return publisher, address
        return p, self.fields.get('address')

    def __str__(self):
        """Return the BibTeX representation of the entry."""
        res = "@%s{%s" % (self.type, self.key)
        for k, v in bibtex.fieldorder.itersorted(self.fields):
            res += ',\n %s = {%s}' % (k, v.strip() if hasattr(v, 'strip') else v)
        res += '\n}\n' if self.fields else ',\n}\n'
        return res

    def text(self) -> str:
        """Return the text linearization of the entry."""
        return Source(self.type, self.key, _check_id=False, **self.fields).text()

    @property
    def id(self) -> str:
        """
        The qualified entry ID, including the provider prefix.
        """
        return '{0}:{1}'.format(self.bib.id, self.key)

    @classmethod
    def lgcodes(cls, string):
        """Extract language codes from the value of a "lgcode" field.

        :param string: Field value or `None`.
        :return: `list` of the codes given in square brackets; if there are none, the \
        items of a comma- or slash-separated list, provided that all items are codes.
        """
        if string is None:
            return []
        codes = cls.lgcode_in_brackets_pattern.findall(string)
        if not codes:
            # ... or as comma separated list of identifiers.
            parts = [p.strip() for p in cls.recomma.split(string)]
            codes = [p for p in parts if cls.lgcode_pattern.match(p)]
            if len(codes) != len(parts):
                codes = []
        return codes

    @staticmethod
    def parse_ca(s):
        """Return the trigger quoted in a "computerized assignment" note in `s`, or `None`."""
        if s:
            match = re.search('computerized assignment from "(?P<trigger>[^\"]+)"', s)
            if match:
                return match.group('trigger')

    def languoids(self, langs_by_codes: dict) -> typing.Tuple[list, typing.Optional[str]]:
        """
        Expand the language codes mentioned in a reference's "lgcode" field to `Languoid` objects.

        :param langs_by_codes: `dict` mapping codes to languoids; unknown codes are skipped.
        :return: Pair (list of languoids, trigger of a computerized assignment or `None`).
        """
        res = []
        if 'lgcode' in self.fields:
            for code in self.lgcodes(self.fields['lgcode']):
                if code in langs_by_codes:
                    res.append(langs_by_codes[code])
        return res, self.parse_ca(self.fields.get('lgcode'))

    def doctypes(self, hhtypes):
        """Ordered doctypes assigned to this entry.

        :param hhtypes: `OrderedDict` mapping doctype names to doctypes
        :return: Pair of the `list` of values of `hhtypes` which apply to the entry, ordered \
        by occurrence in `hhtypes`, and the trigger of a computerized assignment or `None`.
        """
        res = set()
        if 'hhtype' in self.fields:
            for ss in split_text(self.fields['hhtype'], separators=',;'):
                # Strip a parenthesized annotation following the doctype name.
                ss = ss.split('(')[0].strip()
                if ss in hhtypes:
                    res.add(ss)
        return [v for k, v in hhtypes.items() if k in res], self.parse_ca(self.fields.get('hhtype'))