Source code for ihm.dictionary

"""Classes to read in and represent an mmCIF extension dictionary"""

import ihm.reader
import ihm.format
import ihm.format_bcif
import re
import itertools

from ihm.reader import Handler


# Handle special values for CIF data items ('.', '?', or missing entirely)
class _CifSpecialValue(object):
    pass


class _NotInFileCif(_CifSpecialValue):
    pass


class _OmittedCif(_CifSpecialValue):
    pass


class _UnknownCif(_CifSpecialValue):
    pass


class _KeywordEnumeration(set):
    """Set of possible values for a keyword. Can be case insensitive."""
    def __init__(self):
        super(_KeywordEnumeration, self).__init__()
        self.case_sensitive = True
        self._upper_set = None

    def add(self, item):
        self._upper_set = None  # Invalidate upper_set
        super(_KeywordEnumeration, self).add(item)

    def __contains__(self, item):
        if self.case_sensitive:
            return super(_KeywordEnumeration, self).__contains__(item)
        else:
            if self._upper_set is None:
                self._upper_set = set(x.upper() for x in self)
            return item.upper() in self._upper_set


[docs] class ValidatorError(Exception): """Exception raised if a file fails to validate. See :meth:`Dictionary.validate`.""" pass
class _ValidatorCategoryHandler(Handler): # Handle special values for CIF data items ('.', '?', or missing entirely) # explicitly, rather the default behavior (mapping to None or '?') not_in_file = _NotInFileCif() omitted = _OmittedCif() unknown = _UnknownCif() def __init__(self, sysr, category): super(_ValidatorCategoryHandler, self).__init__(sysr) self.category = '_' + category.name self.category_obj = category self._keys = [k.lower() for k in category.keywords.keys()] self.link_keys = set() li = sysr.dictionary.linked_items for link in itertools.chain(li.keys(), li.values()): cat, key = link.split('.') if cat == self.category: self.link_keys.add(key) def __call__(self, *args): self.sysr.validate_data(self.category_obj, self._keys, args, self.link_keys) class _ValidatorReader(object): """Track information used for validation while reading an mmCIF file""" def __init__(self, dictionary): self.dictionary = dictionary self._seen_categories = set() self._unknown_categories = set() self._unknown_keywords = set() # Keep track of all values (IDs) seen for keys that are involved in # parent-child relationships self._seen_ids = {} li = dictionary.linked_items for link in itertools.chain(li.keys(), li.values()): self._seen_ids[link] = set() self.errors = [] def validate_data(self, category, keywords, args, link_keys): self._seen_categories.add(category.name) for key, value in zip(keywords, args): if key in link_keys and not isinstance(value, _CifSpecialValue): self._seen_ids["_%s.%s" % (category.name, key)].add(value) kwobj = category.keywords[key] if kwobj.mandatory: if isinstance(value, _UnknownCif): self.errors.append("Mandatory keyword %s.%s cannot have " "value '?'" % (category.name, key)) elif isinstance(value, _NotInFileCif): self.errors.append("Mandatory keyword %s.%s cannot be " "missing from the file" % (category.name, key)) if isinstance(value, _CifSpecialValue): continue if kwobj.enumeration and value not in kwobj.enumeration: self.errors.append("Keyword %s.%s value %s is not a valid " "enumerated value (options are %s)" % (category.name, key, value, ", ".join(sorted(kwobj.enumeration)))) if kwobj.item_type and not kwobj.item_type.regex.match(str(value)): self.errors.append("Keyword %s.%s value %s does not match " "item type (%s) regular expression (%s)" % (category.name, key, value, kwobj.item_type.name, kwobj.item_type.construct)) def _check_mandatory_categories(self): all_categories = self.dictionary.categories mandatory_categories = [c.name for c in all_categories.values() if c.mandatory] missing = set(mandatory_categories) - self._seen_categories if missing: self.errors.append( "The following mandatory categories are missing " "in the file: %s" % ", ".join(sorted(missing))) def _check_linked_items(self): """Check to make sure any ID referenced by a child item is defined in the parent""" for child, parent in self.dictionary.linked_items.items(): if not self._seen_ids[child] <= self._seen_ids[parent]: # Strip _ prefix from category cat, key = parent[1:].split('.') # Only warn about relationships where the parent is defined # in this dictionary (e.g. a lot of IHM items point back # to PDBx categories) # Chemical component dictionary checks are handled elsewhere; # the chem_comp_* categories don't need to be fully populated if cat in self.dictionary.categories \ and not cat.startswith('chem_comp_'): missing = sorted(self._seen_ids[child] - self._seen_ids[parent]) self.errors.append( "The following IDs referenced by %s " "were not defined in the parent category (%s): %s" % (child, parent, ", ".join(missing))) def _check_unknown(self): """Report errors for any unknown keywords or categories""" if self._unknown_categories: self.errors.append( "The following categories are not defined in the " "dictionary: %s" % ", ".join(sorted(self._unknown_categories))) if self._unknown_keywords: self.errors.append( "The following keywords are not defined in the dictionary: %s" % ", ".join(sorted(self._unknown_keywords))) def report_errors(self): self._check_mandatory_categories() self._check_linked_items() self._check_unknown() if self.errors: raise ValidatorError("\n\n".join(self.errors)) class _UnknownCategoryHandler(object): def __init__(self, sysr): self.sysr = sysr def __call__(self, catname, line): self.sysr._unknown_categories.add(catname) class _UnknownKeywordHandler(object): def __init__(self, sysr): self.sysr = sysr def __call__(self, catname, keyname, line): self.sysr._unknown_keywords.add("%s.%s" % (catname, keyname))
[docs] class Dictionary(object): """Representation of an mmCIF dictionary. See :func:`read` to create a Dictionary from a file. Multiple Dictionaries can be added together to yield a Dictionary that includes all the data in the original Dictionaries. See the `validator example <https://github.com/ihmwg/python-ihm/blob/main/examples/validate_pdb_dev.py>`_ for an example of using this class.""" # noqa: E501 def __init__(self): #: Mapping from name to :class:`Category` objects self.categories = {} #: Links between items; keys are children, values are parents e.g. #: ``linked_items['_ihm_starting_model_details.asym_id'] = #: '_struct_asym.id'`` self.linked_items = {} def __iadd__(self, other): for name, cat in other.categories.items(): if name in self.categories: # If both dictionaries contain information on the same # category, combine it self.categories[name]._update(cat) else: self.categories[name] = cat self.linked_items.update(other.linked_items) return self def __add__(self, other): d = Dictionary() d += self d += other return d
[docs] def validate(self, fh, format='mmCIF'): """Validate the given file against this dictionary. :param file fh: The file handle to read from. :param str format: The format of the file. This can be 'mmCIF' (the default) for the (text-based) mmCIF format or 'BCIF' for BinaryCIF. :raises: :class:`ValidatorError` if the file fails to validate. """ reader_map = {'mmCIF': ihm.format.CifReader, 'BCIF': ihm.format_bcif.BinaryCifReader} s = _ValidatorReader(self) uchandler = _UnknownCategoryHandler(s) ukhandler = _UnknownKeywordHandler(s) r = reader_map[format](fh, {}, unknown_category_handler=uchandler, unknown_keyword_handler=ukhandler) handlers = [_ValidatorCategoryHandler(s, cat) for cat in self.categories.values()] r.category_handler = dict((h.category, h) for h in handlers) # Read all data blocks while r.read_file(): pass s.report_errors()
[docs] class Category(object): """Representation of a single category in a :class:`Dictionary`.""" def __init__(self): #: Category name self.name = None #: Human-readable text self.description = None #: Mapping from name to :class:`Keyword` objects self.keywords = {} #: True iff this category is required in a compliant mmCIF file self.mandatory = None def _update(self, other): """Update with information from another Category object""" assert other.name == self.name self.keywords.update(other.keywords) self.description = self.description or other.description if self.mandatory is None: # e.g. if other.mandatory is False and self.mandatory is None # we want to use False; "None or False" returns None. self.mandatory = other.mandatory else: self.mandatory = self.mandatory or other.mandatory
class _DoNothingRegEx(object): """A mock regex object which always matches""" def match(self, value): return True
[docs] class ItemType(object): """Represent the type of a data item. This keeps the set of valid strings for values of a given :class:`Keyword`. For example, integer values can only contain the digits 0-9 with an optional +/- prefix.""" def __init__(self, name, primitive_code, construct): self.name = name # The dictionary only defines matches against ASCII characters. # Extend this to match any Unicode "word" character so we don't # fail to validate as soon as we see an accented character. self.construct = construct.replace('A-Za-z0-9', r'\w') self.primitive_code = primitive_code # Ensure that regex matches the entire value try: self.regex = re.compile(self.construct + '$') except re.error: # Some CIF regexes aren't valid Python regexes; skip these self.regex = _DoNothingRegEx() case_sensitive = property(lambda x: x.primitive_code != 'uchar', doc='True iff this type is case sensitive')
[docs] class Keyword(object): """Representation of a single keyword in a :class:`Category`.""" def __init__(self): #: Keyword name self.name = None #: True iff this keyword is required in a compliant mmCIF file self.mandatory = None #: Set of acceptable values, or None self.enumeration = None #: :class:`ItemType` for this keyword, or None self.item_type = None
class _DictionaryReader(object): """Track information for a Dictionary being read from a file.""" def __init__(self): self.dictionary = Dictionary() self.item_types = {} # Mapping from name to ItemType object self._reset_category() self._reset_keyword() def _reset_category(self): self.category = Category() self.category_good = False def _reset_keyword(self): self._keyword_info = [] self._keyword_item_type = None self._keyword_enumeration = None self.keyword_good = False def end_save_frame(self): if self.keyword_good: for (name, category, mandatory) in self._keyword_info: k = Keyword() k.name, k.mandatory = name.lower(), mandatory k.enumeration = self._keyword_enumeration k.item_type = self._keyword_item_type # If the owning category does not exist, make it; this can # happen if we extend something in the core dictionary # (e.g. atom_site.ihm_model_id) if category not in self.dictionary.categories: c = Category() c.name = category self.dictionary.categories[c.name] = c else: c = self.dictionary.categories[category] c.keywords[k.name] = k self._reset_keyword() if self.category_good: c = self.category if c.name in self.dictionary.categories: # Handle case where keywords were defined before category self.dictionary.categories[c.name]._update(c) else: self.dictionary.categories[c.name] = c self._reset_category() class _CategoryHandler(Handler): category = '_category' def __call__(self, id, description, mandatory_code): c = self.sysr.category c.name, c.description = id, description c.mandatory = self.get_bool(mandatory_code) self.sysr.category_good = True def end_save_frame(self): self.sysr.end_save_frame() class _ItemHandler(Handler): category = '_item' def __call__(self, name, category_id, mandatory_code): cat, name = name.split('.') ki = self.sysr._keyword_info # If category_id is missing, strip leading _ from the keyword's # own category name and use that instead if category_id is None: category_id = cat[1:] ki.append((name, category_id, self.get_bool(mandatory_code))) self.sysr.keyword_good = True class _ItemEnumerationHandler(Handler): category = '_item_enumeration' def __call__(self, value): if self.sysr._keyword_enumeration is None: self.sysr._keyword_enumeration = _KeywordEnumeration() self.sysr._keyword_enumeration.add(value) class _ItemTypeListHandler(Handler): category = '_item_type_list' def __call__(self, code, primitive_code, construct): it = ItemType(code, primitive_code, construct) self.sysr.item_types[it.name] = it class _ItemTypeHandler(Handler): category = '_item_type' def __call__(self, code): self.sysr._keyword_item_type = code def finalize(self): for c in self.sysr.dictionary.categories.values(): for k in c.keywords.values(): if k.item_type is not None: # Map unrecognized type codes to None # For example, the ihm dictionary often uses the # 'atcode' type which is not defined in the dictionary # itself (but presumably is in the base PDBx dict) k.item_type = self.sysr.item_types.get(k.item_type) if k.item_type is not None and k.enumeration: k.enumeration.case_sensitive = k.item_type.case_sensitive class _ItemLinkedHandler(Handler): category = '_item_linked' def __call__(self, child_name, parent_name): self.sysr.dictionary.linked_items[child_name] = parent_name
[docs] def read(fh): """Read dictionary data from the mmCIF file handle `fh`. :return: The dictionary data. :rtype: :class:`Dictionary` """ r = ihm.format.CifReader(fh, {}) s = _DictionaryReader() handlers = [_CategoryHandler(s), _ItemHandler(s), _ItemEnumerationHandler(s), _ItemTypeListHandler(s), _ItemTypeHandler(s), _ItemLinkedHandler(s)] r.category_handler = dict((h.category, h) for h in handlers) r.read_file() for h in handlers: h.finalize() return s.dictionary