Source code for glypy.io.nomenclature.identity

from collections import defaultdict
import operator

from six import string_types as basestring

from ...utils import groupby
from ...structure import (
    named_structures, Monosaccharide, Substituent,
    Anomer, Stem, RingType,
    SuperClass, Configuration)
from ...algorithms.similarity import (monosaccharide_similarity, has_substituent,
                                      has_modification, has_monosaccharide,
                                      is_generic_monosaccharide)
from ...composition.composition_transform import strip_derivatization
from .synonyms import monosaccharides as monosaccharide_synonyms


def has_ambiguity(node):
    '''
    Report whether any of the inspected traits of `node` is the unknown
    (``x``) member of its enumeration.

    The traits are tested in this order, stopping at the first hit: the
    first element of ``node.stem``, ``node.anomer``, ``node.superclass``
    and ``node.ring_type``. Only the first stem entry is inspected.

    Parameters
    ----------
    node: object
        Anything exposing ``stem`` (indexable), ``anomer``, ``superclass``
        and ``ring_type`` attributes.

    Returns
    -------
    bool
    '''
    if node.stem[0] is Stem.x:
        return True
    if node.anomer is Anomer.x:
        return True
    if node.superclass is SuperClass.x:
        return True
    return node.ring_type is RingType.x


# Snapshot of the name -> structure table taken once at import time, so that
# comparisons can reuse these structures without copying them.
monosaccharides = dict(named_structures.monosaccharides)
# (name, structure) pairs sorted on the boolean ambiguity flag. The sort is
# stable and False orders before True, so unambiguous structures come first.
monosaccharides_ordered = sorted(
    monosaccharides.items(), key=lambda entry: has_ambiguity(entry[1]))


def get_preferred_name(name, selector=min, key=len):
    '''
    Pick the preferred spelling among `name` and its known synonyms.

    The candidates are the synonyms recorded for `name` in the synonym
    table (or just `name` when it has no entry) followed by `name` itself.
    `selector` is then called as ``selector(candidates, key=key)``. With
    the defaults (:func:`min` and :func:`len`) this yields the shortest
    candidate.

    Parameters
    ----------
    name: str
        Given name to compare to its synonyms
    selector: function
        Function used to choose the preferred name from the candidates
    key: function
        Function used to convert each candidate name into the statistic
        that `selector` compares

    Returns
    -------
    str
        The candidate chosen by `selector`
    '''
    candidates = monosaccharide_synonyms.get(name, [name]) + [name]
    return selector(candidates, key=key)
def is_a(node, target, tolerance=0, include_modifications=True, include_substituents=True, exact=True,
         short_circuit=False, ignore_ring=True, **kwargs):
    '''
    Semi-fuzzy test of whether `node`, the residue being queried, matches
    `target`, the known reference residue.

    A substituent `node` is compared to a substituent `target` by name
    only. Any other `node` is compared to a |Monosaccharide| `target` with
    :func:`~.monosaccharide_similarity`, without comparing children. The
    match succeeds when the number of unmatched traits does not exceed
    `tolerance`. Mixed kinds (substituent against monosaccharide, or the
    reverse) never match.

    Forwards all unmatched keyword arguments to
    :func:`~.monosaccharide_similarity`

    Parameters
    ----------
    node: Monosaccharide or Substituent
        Object to be identified
    target: Monosaccharide, Substituent or str
        The reference type. A |str| is used to look up a structure by name
        in this module's ``monosaccharides`` table
    tolerance: int
        The number of unmatched traits that is still accepted as a match
    include_modifications: bool
        Whether or not to include modifications in comparison. Defaults to |True|
    include_substituents: bool
        Whether or not to include substituents in comparison. Defaults to |True|
    exact: bool
        Whether or not to penalize for unmatched attachments. Defaults to |True|
    short_circuit: bool
        When true, `tolerance` is passed to the similarity function as
        ``short_circuit_after``; otherwise |None| is passed. Defaults to |False|
    ignore_ring: bool
        Passed through unchanged to the similarity function as
        ``ignore_ring``. Defaults to |True|

    Returns
    -------
    bool

    Raises
    ------
    KeyError:
        When `target` is a string that is not a key of the
        ``monosaccharides`` table.
    '''
    if isinstance(target, basestring):
        target = monosaccharides[target]

    if isinstance(node, Substituent):
        if not isinstance(target, Substituent):
            return False
        # Substituents expose a single comparable trait: their name.
        observed = int(node.name == target.name)
        expected = 1
    elif not isinstance(target, Monosaccharide):
        return False
    else:
        cutoff = tolerance if short_circuit else None
        observed, expected = monosaccharide_similarity(
            node, target,
            include_modifications=include_modifications,
            include_substituents=include_substituents,
            include_children=False,
            exact=exact,
            ignore_ring=ignore_ring,
            short_circuit_after=cutoff,
            **kwargs)

    return (expected - observed) <= tolerance
def identify(node, blacklist=None, tolerance=0, include_modifications=True, include_substituents=True,
             ignore_ring=True, **kwargs):
    '''
    Attempt to find a common usage name for the given |Monosaccharide|, `node`.

    Every entry of ``monosaccharides_ordered`` is tried in order with
    :func:`is_a`. The first entry that matches determines the result,
    which is the preferred synonym of that entry's name as chosen by
    :func:`get_preferred_name`.

    Forwards all unmatched keyword arguments to :func:`is_a`, which in
    turn forwards them to :func:`~.monosaccharide_similarity`

    Parameters
    ----------
    node: Monosaccharide
        Object to be identified
    blacklist: collection of str
        Names that must not be matched against because they are too
        general. When |None|, the set ``{"Pen", "Hex", "Hep", "Oct", "Non"}``
        is used.
    tolerance: int
        The error tolerance for the search
    include_modifications: bool
        Whether or not to include modifications in comparison. Defaults to |True|
    include_substituents: bool
        Whether or not to include substituents in comparison. Defaults to |True|
    ignore_ring: bool
        Passed through to :func:`is_a`. Defaults to |True|

    Returns
    -------
    str

    Raises
    ------
    IdentifyException:
        When a suitable name cannot be found.

    See Also
    --------
    is_a
    get_preferred_name
    monosaccharide_similarity
    '''
    excluded = {"Pen", "Hex", "Hep", "Oct", "Non"} if blacklist is None else blacklist

    for name, structure in monosaccharides_ordered:
        if name not in excluded and is_a(
                node, structure, tolerance, include_modifications, include_substituents,
                ignore_ring=ignore_ring, **kwargs):
            return get_preferred_name(name)

    raise IdentifyException("Could not identify {}".format(node))
class IdentifyException(KeyError):
    '''
    Raised when no suitable name can be found for a residue.

    Subclasses :class:`KeyError`, so code that treats a failed
    identification as a failed lookup can catch either type.
    '''
def naive_name_monosaccharide(monosaccharide):
    '''
    Generate a generic name for `monosaccharide`, based loosely on IUPAC
    naming schema without including information about linkage.

    The work is done on a clone with derivatization stripped, using
    progressively looser attempts:

    1. If the superclass value of `monosaccharide` is greater than 6, try
       :func:`identify` on the clone as-is. Any error in this step is
       ignored and the next step is tried.
    2. Clear the anomer and try :func:`identify` again.
    3. Clear the stem and configuration as well and try once more.
    4. Otherwise build a name from the modification names (skipping
       ``'aldi'``), the title-cased superclass name, and the first three
       characters of each substituent's title-cased name with underscores
       removed.

    Parameters
    ----------
    monosaccharide: Monosaccharide

    Returns
    -------
    str or None:
        A simple name based on `SuperClass`, modifications, and substituents,
        or |None| when the clone of `monosaccharide` is not a |Monosaccharide|.

    See Also
    --------
    :func:`glypy.io.nomenclature.identity.identify`
    '''
    try:
        residue = monosaccharide.clone()
        if not isinstance(residue, Monosaccharide):
            return None
        strip_derivatization(residue)
        try:
            if monosaccharide.superclass.value > 6:
                return identify(residue, tolerance=0)
        except Exception:
            # Best effort only: fall through to the anomer-free attempt.
            pass
        residue.anomer = None
        return identify(residue)
    except IdentifyException:
        try:
            residue.stem = None
            residue.configuration = None
            return identify(residue)
        except IdentifyException:
            modification_part = "".join(
                mod.name for mod in list(residue.modifications.values())
                if mod.name != 'aldi')
            superclass_part = residue.superclass.name.title()
            substituent_part = ''.join([
                ''.join(str.title(piece) for piece in subst.name.split("_"))[:3]
                for _, subst in residue.substituents()])
            return modification_part + superclass_part + substituent_part
def split_along_axis(monosaccharides, axis):
    '''
    Group `monosaccharides` by the value of the attribute named `axis`.

    Parameters
    ----------
    monosaccharides: iterable
        Objects exposing the attribute named by `axis`
    axis: str
        Attribute name, as understood by :func:`operator.attrgetter`

    Returns
    -------
    The grouping produced by ``groupby``, keyed by attribute value.
    '''
    return groupby(monosaccharides, operator.attrgetter(axis))


def residue_list_to_tree(monosaccharides, axes=('anomer', 'superclass', 'stem', 'configuration')):
    '''
    Recursively group `monosaccharides` by each attribute in `axes`.

    The result is grouped by ``axes[0]``. While further axes remain, each
    group is replaced by the tree built from that group over the remaining
    axes, so the innermost level holds the groups for the last axis.

    Parameters
    ----------
    monosaccharides: iterable
        Objects to arrange
    axes: sequence of str
        Attribute names, outermost level first. Must not be empty.

    Returns
    -------
    The nested grouping described above.
    '''
    tree = split_along_axis(monosaccharides, axes[0])
    if len(axes) <= 1:
        return tree
    remaining_axes = axes[1:]
    # Iterate over a snapshot of the keys because the values are reassigned.
    for key in list(tree):
        tree[key] = residue_list_to_tree(tree[key], remaining_axes)
    return tree


class MonosaccharideIdentifier(object):
    '''
    Name monosaccharides by looking them up in a tree of reference
    structures grouped by anomer, superclass, stem and configuration.

    Attributes
    ----------
    reference_index: dict
        Mapping of name to reference structure
    trait_tree:
        Nested grouping of the distinct reference structures, built by
        :func:`residue_list_to_tree` with its default axes
    name_map: dict
        Mapping of reference structure to the shortest name it is
        registered under
    '''

    def __init__(self, reference_index=None, **kwargs):
        '''
        Parameters
        ----------
        reference_index: mapping, optional
            Mapping of name to reference structure. Defaults to a copy of
            ``named_structures.monosaccharides``.
        **kwargs:
            Accepted but not used.
        '''
        if reference_index is None:
            reference_index = dict(named_structures.monosaccharides)
        self.reference_index = dict(reference_index)
        distinct_structures = set(self.reference_index.values())
        self.trait_tree = residue_list_to_tree(distinct_structures)
        self.name_map = self._build_name_map()

    def _build_name_map(self):
        '''
        Map every reference structure to the shortest of the names under
        which it appears in ``reference_index``.
        '''
        pairs_by_structure = groupby(self.reference_index.items(), lambda pair: pair[1])
        shortest_name = {}
        for structure, pairs in pairs_by_structure.items():
            shortest_name[structure] = min([name for name, _ in pairs], key=len)
        return shortest_name

    def _find_potential_matches(self, monosaccharide, exact_candidates=False, **kwargs):
        '''
        Collect the reference structures that `monosaccharide` could be,
        each paired with a similarity ratio.

        The trait tree is descended one level per trait (anomer,
        superclass, stem, configuration). At each level both the branch for
        the query's own value and the branch for the unknown (``x``) value
        are followed. Every structure reached is screened with
        :func:`is_a` and the survivors are scored with
        :func:`~.monosaccharide_similarity`.

        Parameters
        ----------
        monosaccharide: Monosaccharide
            The residue being queried
        exact_candidates: bool
            Passed to :func:`is_a` as ``exact`` when screening candidates
        **kwargs:
            Forwarded to :func:`~.monosaccharide_similarity`. ``exact``,
            ``treat_null_as_wild`` and ``match_attachement_positions``
            default to |True|, |False| and |True| respectively.

        Returns
        -------
        dict
            Mapping of candidate structure to matched / possible score ratio
        '''
        # NOTE(review): indexing the tree with a trait value that is not
        # present only works if ``groupby`` returns a mapping that supplies a
        # default for missing keys (e.g. a defaultdict) -- confirm.
        anomer = monosaccharide.anomer
        branches = [self.trait_tree[anomer]]
        if anomer != Anomer.x:
            branches.append(self.trait_tree[Anomer.x])

        # Remaining levels of the tree, each with its "unknown" key.
        deeper_levels = (
            ('superclass', SuperClass.x),
            ('stem', (Stem.x, )),
            ('configuration', (Configuration.x, )),
        )
        for trait, wildcard in deeper_levels:
            value = getattr(monosaccharide, trait)
            descended = []
            for branch in branches:
                if not branch:
                    continue
                descended.append(branch[value])
                if value != wildcard:
                    descended.append(branch[wildcard])
            branches = descended

        # The innermost level holds groups of structures; flatten them.
        pool = [member for bucket in branches for member in bucket]

        scores = {}
        kwargs.setdefault('exact', True)
        kwargs.setdefault('treat_null_as_wild', False)
        kwargs.setdefault('match_attachement_positions', True)
        for member in pool:
            if is_a(monosaccharide, member, exact=exact_candidates):
                matched, possible = monosaccharide_similarity(
                    monosaccharide, member, **kwargs)
                scores[member] = matched / float(possible)
        return scores

    def query(self, monosaccharide, **kwargs):
        '''
        Return the best-scoring reference structure for `monosaccharide`.

        Parameters
        ----------
        monosaccharide: Monosaccharide
            The residue being queried
        **kwargs:
            Forwarded to :meth:`_find_potential_matches`

        Returns
        -------
        The candidate with the highest similarity ratio, or |None| when
        there are no candidates.
        '''
        scores = self._find_potential_matches(monosaccharide, **kwargs)
        if not scores:
            return None
        return max(scores, key=scores.get)

    def identify(self, monosaccharide, **kwargs):
        '''
        Return the name of the best-matching reference structure.

        Parameters
        ----------
        monosaccharide: Monosaccharide
            The residue being queried
        **kwargs:
            Forwarded to :meth:`query`

        Returns
        -------
        str

        Raises
        ------
        IdentifyException:
            When :meth:`query` finds no match.
        '''
        template = self.query(monosaccharide, **kwargs)
        if template is None:
            raise IdentifyException(monosaccharide)
        return self.name_map[template]