Source code for glypy.io.iupac

import re
import warnings

from collections import deque, namedtuple
from functools import partial

from glypy.structure import (
    Monosaccharide, Glycan, Link, AmbiguousLink,
    Substituent, constants, named_structures, UnknownPosition)
from glypy.composition import Composition
from glypy.composition.structure_composition import substituent_compositions
from glypy.composition.composition_transform import has_derivatization, derivatize
from glypy.io import format_constants_map
from glypy.io.nomenclature import identity
from glypy.utils import invert_dict

from glypy.io.file_utils import ParserInterface, ParserError


# A static copy of monosaccharide names to structures for copy-free comparison
monosaccharide_reference = {k: v for k, v in named_structures.monosaccharides.items()}

special_base_types = {
    # "Neu5Ac", "Neu5Gc", "Neu",
    # "Kdn", "Kdo",
    "Oli", "Tyv",
    "Psi", "Fru", "Sor", "Tag",
    "Xul", "Sed"
}

special_base_types = {
    s: monosaccharide_reference[s]
    for s in special_base_types
}

special_base_type_resolver = identity.MonosaccharideIdentifier(special_base_types)

anomer_map_from = dict(format_constants_map.anomer_map)
anomer_map_from['?'] = anomer_map_from.pop('x')
anomer_map_to = invert_dict(anomer_map_from)
anomer_map_from['beta'] = anomer_map_from['b']
anomer_map_from['alpha'] = anomer_map_from['a']
anomer_map_from[u"\u03B1"] = anomer_map_from['a']
anomer_map_from[u"\u03B2"] = anomer_map_from['b']


Stem = constants.Stem
Configuration = constants.Configuration
Modification = constants.Modification
SuperClass = constants.SuperClass


def tryint(i):
    try:
        return int(i)
    except ValueError:
        return -1


[docs]class IUPACError(ParserError): pass
def _make_substituent_name(name): return ''.join(t.title() for t in name.split("_")).replace("(", "").replace(")", "") substituents_map_to = { name: _make_substituent_name(name) for name in substituent_compositions } # Special Cases substituents_map_to['n_acetyl'] = "NAc" substituents_map_to['n_glycolyl'] = "NGc" substituents_map_to['n_sulfate'] = "NS" substituents_map_to['sulfate'] = "S" substituents_map_to["methyl"] = "Me" substituents_map_to["acetyl"] = "Ac" substituents_map_to["glycolyl"] = "Gc" substituents_map_to["fluoro"] = "F" substituents_map_to["amino"] = "N" substituents_map_to['phosphate'] = 'P' substituents_map_to['phospho_ethanolamine'] = 'PEtn' substituents_map_to['ethanolamine'] = 'Etn' substituents_map_from = invert_dict(substituents_map_to) substituents_map_from['Phosphate'] = 'phosphate' _modification_map_to = { 'deoxy': 'd', } _substituent_replacement_rules = { 'NeuAc': [ ('n_acetyl', 'acetyl') ], 'NeuGc': [ ('n_glycolyl', 'glycolyl') ], 'Neu': [ ('amino', None) ] }
[docs]class SubstituentSerializer(object): """Build the textual encoding for the relevant substituents for a provided monosaccharide. Attributes ---------- monosaccharide_reference : :class:`dict` Map base type to :class:`~.Monosaccharide` """ def __init__(self, monosaccharides=None, substitution_rules=None, substituent_map=None): if monosaccharides is None: monosaccharides = monosaccharide_reference if substitution_rules is None: substitution_rules = _substituent_replacement_rules if substituent_map is None: substituent_map = substituents_map_to self.monosaccharide_reference = monosaccharides self.substitution_rules = substitution_rules self.substituent_map = substituent_map def __call__(self, residue, **kwargs): """Alias for :meth:`resolve_substituents` """ return self.resolve_substituents(residue, **kwargs) def serialize_substituent(self, substituent): """Obtain an IUPAC-compatible name for ``substituent`` Parameters ---------- substituent : :class:`~.Substituent` The subsituent group to get the name for Returns ------- :class:`str` """ name = substituent.name if name in self.substituent_map: part = self.substituent_map[name] else: part = _make_substituent_name(name) warnings.warn("Registering IUPAC name %r for %r" % (name, part)) self.substituent_map[name] = part if part not in substituents_map_from: substituents_map_from[part] = name return part def resolve_substituents(self, residue, **kwargs): """Build a textual encoding of the substituent list for ``residue``. Parameters ---------- residue : :class:`~.Monosaccharide` The residue to build the substituent list for Returns ------- :class:`str` """ substituent = "" multi = False for name, pos in self.get_relevant_substituents(residue): if pos in {UnknownPosition, None}: pos = "" if name in self.substituent_map: part = self.substituent_map[name] else: part = _make_substituent_name(name) warnings.warn("Registering IUPAC name %r for %r" % (name, part)) self.substituent_map[name] = part if part not in substituents_map_from: substituents_map_from[part] = name # If there is a substituent after the first, successive ones are placed in parentheses if multi: substituent += "({}{})".format(pos, part) else: substituent += "{}{}".format(pos, part) multi = True return substituent def _test_for_replacement(self, residue, reference, positions, substituents, replacements, exact=False): if identity.is_a(residue, reference, exact=exact, short_circuit=True): self._substituent_replacement(positions, substituents, replacements) def _substituent_replacement(self, positions, substituents, pairs): for target, replacement in pairs: try: i = substituents.index(target) substituents.pop(i) j = positions.pop(i) if replacement is not None: substituents.insert(i, replacement) positions.insert(i, j) except Exception: # pragma: no cover pass def get_relevant_substituents(self, residue): ''' Retrieve the set of substituents not implicitly included in the base type's symbol name. Certain base types have implied substituent groups or partial substituent groups. For example, from the perspective of :mod:`glypy`, "n-acetyl" is a discrete unit, but from a structural perspective it is a substituted amine group that was later acetylated. The "Neu" base type implies an amination of carbon 5. In "Neu5Ac", the amine at carbon 5 is acetylated, but because the amine is implied, the N of the "NAc" signifier is omitted. In IUPAC's trivial coding, substituents are listed following the base type, with each substituent encoded as an optional position specifier followed immediately by a shortened version of the substituent group's name. The first substituent immediately follows the base type, and subsequent substituent groups are enclosed in parentheses. ''' monosaccharides = self.monosaccharide_reference positions = [p for p, sub in residue.substituents() if not sub._derivatize] substituents = [sub.name for p, sub in residue.substituents() if not sub._derivatize] for reference_name, replacements in self.substitution_rules.items(): reference = monosaccharides[reference_name] self._test_for_replacement(residue, reference, positions, substituents, replacements) return zip(substituents, positions)
resolve_substituent = SubstituentSerializer()
[docs]class ModificationSerializer(object): """Build the textual encoding for the relevant modifications for a provided monosaccharide base type. """ def extract_modifications(self, modifications, base_type): """Build a string representing the relevant modifications for. Certain base types imply a collection of modifications by default. These modifications should not be included in the textual encoding. In IUPAC's trivial coding, modifications are specified as a comma-separated list preceding the base definition with an optional position designation linked by a dash character to the modification name. For example "deoxy" and "?-deoxy" both signify a deoxidation at an unknown position, and "6-deoxy" indicates the deoxidation appears at carbon 6. Parameters ---------- modifications : :class:`~.MultiMap` A mapping between position and modifications base_type : :class:`str` The monosaccharide base type to build the list for. Uses a hard-coded list of modifications Returns ------- :class:`str` """ buff = [] template = '{position}-{name}' pos_mod_pairs = list(modifications.items()) try: pos, mods = map(list, zip(*pos_mod_pairs)) except ValueError: pos, mods = [], [] if "Neu" in base_type or "Kd" in base_type: for mod in [Modification.d, Modification.keto, Modification.a]: try: pop_ix = mods.index(mod) pos.pop(pop_ix) mods.pop(pop_ix) except Exception: # pragma: no cover pass elif "Fuc" in base_type or "Qui" in base_type or "Rha" in base_type: for mod in [Modification.d]: pop_ix = mods.index(mod) pos.pop(pop_ix) mods.pop(pop_ix) elif "Fru" in base_type or "Psi" in base_type or "Sor" in base_type or "Tag" in base_type: for mod in [Modification.keto]: pop_ix = mods.index(mod) pos.pop(pop_ix) mods.pop(pop_ix) pos_mod_pairs = zip(pos, mods) for pos, mod in pos_mod_pairs: if pos != UnknownPosition: buff.append(template.format(position=pos, name=mod.name)) else: buff.append(mod.name) out = ','.join(buff) if out: out += '-' return out def __call__(self, modifications, base_type): """An alias for :meth:`extract_modifications` """ return self.extract_modifications(modifications, base_type)
extract_modifications = ModificationSerializer()
[docs]class ModificationDeserializer(object): """Parses modification signifiers from text into position, :class:`~.Modification` pairs Attributes ---------- modification_map : :class:`dict` Mapping from text representation to :class:`~.Modification` to provide additional names for the existing modification name mapping. """ def __init__(self, modification_map=None): if modification_map is None: modification_map = _modification_map_to.copy() else: t = _modification_map_to.copy() t.update(modification_map) modification_map = t self.modification_map = modification_map def parse_modifications(self, modification_string): """Parses the text for site-modification definitions. In IUPAC's trivial coding, modifications are specified as a comma-separated list preceding the base definition with an optional position designation linked by a dash character to the modification name. For example "deoxy" and "?-deoxy" both signify a deoxidation at an unknown position, and "6-deoxy" indicates the deoxidation appears at carbon 6. Parameters ---------- modification_string : str Returns ------- list: The list of (position, modification) pairs parsed from the string Raises ------ IUPACError: If the modification signifier cannot be translated """ buff = modification_string.split(",") pairs = [] for token in buff: if token == '': continue try: pos, mod = token.split("-") except Exception: pos = UnknownPosition mod = token try: mod_t = self.modification_map.get(mod, mod) pos = int(pos) pairs.append((pos, Modification[mod_t])) except KeyError: raise IUPACError("Could not determine modification from %s" % modification_string) return pairs def __call__(self, modification_string): """An alias for :meth:`parse_modifications` """ return self.parse_modifications(modification_string)
parse_modifications = ModificationDeserializer()
[docs]class MonosaccharideSerializer(object): """Serialize a :class:`~.Monosaccharide` object to IUPAC text Attributes ---------- modification_extractor: :class:`ModificationSerializer` Convert modifications to a text list monosaccharide_reference : :class:`dict` Map base type to :class:`~.Monosaccharide` substituent_resolver : :class:`SubstituentSerializer` Convert substituents to a text list """ def __init__(self, monosaccharides=None, substituent_resolver=None, modification_extractor=None): if monosaccharides is None: monosaccharides = monosaccharide_reference self.monosaccharide_reference = monosaccharides if substituent_resolver is None: substituent_resolver = SubstituentSerializer(monosaccharides) self.substituent_resolver = substituent_resolver if modification_extractor is None: modification_extractor = ModificationSerializer() self.modification_extractor = modification_extractor def resolve_special_base_type(self, residue): if residue.superclass == SuperClass.non: if residue.stem == (Stem.gro, Stem.gal): substituents = [sub.name for p, sub in residue.substituents() if not sub._derivatize] modifications = [mod for p, mod in residue.modifications.items()] if Modification.a in modifications and\ Modification.keto in modifications and\ Modification.d in modifications: if len(substituents) == 0: return "Kdn" elif "n_acetyl" in substituents: return "Neu" # Ac elif "n_glycolyl" in substituents: return "Neu" # Gc elif "amino" in substituents: return "Neu" # _ elif residue.superclass == SuperClass.oct: if residue.stem == (Stem.man,): if Modification.a in residue.modifications[1] and\ Modification.keto in residue.modifications[2] and\ Modification.d in residue.modifications[3]: return "Kdo" elif residue.stem == (Stem.gal,): if Modification.d in residue.modifications.values(): return "Fuc" elif residue.stem == (Stem.man,): if Modification.d in residue.modifications.values(): return "Rha" elif residue.stem == (Stem.glc,): if Modification.d in residue.modifications.values(): return "Qui" query = special_base_type_resolver.query(residue) if query: return special_base_type_resolver.name_map[query] return None def monosaccharide_to_iupac(self, residue): template = "{anomer}-{configuration}-{modification}{base_type}{ring_type}{substituent}" anomer = anomer_map_to[residue.anomer] if residue.configuration[0] is Configuration.Unknown: configuration = "?" else: configuration = residue.configuration[0].name.upper() modification = "" base_type = self.resolve_special_base_type(residue) if base_type is None: if len(residue.stem) == 1 and residue.stem[0] is not Stem.Unknown: base_type = residue.stem[0].name.title() else: base_type = residue.superclass.name.title() modification = self.modification_extractor(residue.modifications, base_type) ring_type = residue.ring_type.name[0] substituent = self.substituent_resolver(residue) return template.format( anomer=anomer, configuration=configuration, modification=modification, base_type=base_type, ring_type=ring_type, substituent=substituent ) def __call__(self, residue): return self.monosaccharide_to_iupac(residue)
[docs]class DerivatizationAwareMonosaccharideSerializer(MonosaccharideSerializer): """A derivatization aware version of :class:`MonosaccharideSerializer` which deviates from the standard IUPAC code to encode derivatization. If a :class:`~.Monosaccharide` object has a derivatizing substituent attached to it, as detected by :func:`~.has_derivatization`, those substituent groups will normally be ignored. With this subclass, a single entry will be appended to the monosaccharide encoding joined by an "^" character. For example a permethylated hexose would be written "Hex^Me". """ def monosaccharide_to_iupac(self, residue): string = super(DerivatizationAwareMonosaccharideSerializer, self).monosaccharide_to_iupac(residue) deriv = has_derivatization(residue) if deriv: string = "%s^%s" % (string, self.substituent_resolver.serialize_substituent(deriv)) return string
[docs]class SimpleMonosaccharideSerializer(DerivatizationAwareMonosaccharideSerializer): def monosaccharide_to_iupac(self, residue): """ Encode a subset of traits of a :class:`Monosaccharide`-like object using a limited subset of the IUPAC three letter code. Parameters ---------- residue: :class:`~Monosaccharide` The object to be encoded Returns ------- str """ template = "{modification}{base_type}{substituent}" modification = "" base_type = self.resolve_special_base_type(residue) if base_type is None: if len(residue.stem) == 1 and residue.stem[0] is not Stem.Unknown: base_type = residue.stem[0].name.title() else: base_type = residue.superclass.name.title() modification = self.modification_extractor(residue.modifications, base_type) substituent = self.substituent_resolver(residue) string = template.format( modification=modification, base_type=base_type, substituent=substituent ) deriv = has_derivatization(residue) if deriv: string = "%s^%s" % (string, self.substituent_resolver.serialize_substituent(deriv)) return string
monosaccharide_to_iupac = MonosaccharideSerializer() resolve_special_base_type = monosaccharide_to_iupac.resolve_special_base_type
[docs]class LinkageSerializer(object): def __init__(self, open_edge='-(', close_edge=')-', open_branch='[', close_branch=']'): self.open_edge = open_edge self.close_edge = close_edge self.open_branch = open_branch self.close_branch = close_branch def format_linkage(self, linkage): text = "{oe}{attach}-{linkage_pos}{ce}".format( linkage_pos=linkage.parent_position if linkage.parent_position != UnknownPosition else "?", attach=linkage.child_position if linkage.child_position != UnknownPosition else "?", oe=self.open_edge, ce=self.close_edge) return text def format_branch(self, branch): branch = '{ob}{branch}{cb}'.format( branch=''.join(branch), ob=self.open_branch, cb=self.close_branch ) return branch
[docs]class SimpleLinkageSerializer(LinkageSerializer): def __init__(self, open_edge="(", close_edge=")", open_branch="[", close_branch="]"): super(SimpleLinkageSerializer, self).__init__(open_edge, close_edge, open_branch, close_branch) def format_linkage(self, linkage): template = "{oe}{anomer}{attach}-{linkage_pos}{ce}" text = template.format( oe=self.open_edge, anomer=anomer_map_to.get(linkage.child.anomer, "?"), linkage_pos=linkage.parent_position if linkage.parent_position != UnknownPosition else "?", attach=linkage.child_position if linkage.child_position != UnknownPosition else "?", ce=self.close_edge) return text
[docs]class GlycanSerializer(object): """Converts a :class:`~.Glycan` structure to IUPAC format. Also works on individual :class:`~.Monosaccharide` objects, but will traverse any links they have to other nodes. Attributes ---------- linkage_serializer : :class:`LinkageSerializer` An object that converts a :class:`~.Link` object into text monosaccharide_serializer : :class:`MonosaccharideSerializer` An object that converts a :class:`~.Monosaccharide` object into text """ def __init__(self, monosaccharide_serializer=None, linkage_serializer=None): if monosaccharide_serializer is None: monosaccharide_serializer = MonosaccharideSerializer() if linkage_serializer is None: linkage_serializer = LinkageSerializer() self.monosaccharide_serializer = monosaccharide_serializer self.linkage_serializer = linkage_serializer def branch_to_iupac(self, structure=None, attach=None, is_branch=False): '''Translate a |Glycan| structure's branch into IUPAC Three Letter Code. Recursively operates on branches. Parameters ---------- structure: :class:`~.Glycan` or :class:`~.Monosaccharide` The glycan to be translated. Translation starts from `glycan.root` if `structure` is a |Glycan|. May also be a :class:`~.Monosaccharide` which is the root of a branch of the overall structure. attach: int The point from the structure tree is attached to its parent. Used for recursively handling branches. Defaults to |None|. is_branch: :class:`bool` Whether this structure contains the root of the overall structure or a branch Returns ------- :class:`collections.deque` ''' base = structure.root if isinstance(structure, Glycan) else structure stack = [(attach, base)] outstack = deque() while(len(stack) > 0): outedge, node = stack.pop() link = "" if outedge is not None: link = self.linkage_serializer.format_linkage(outedge) # Branch linkage does not start with leading dash if is_branch and link[-1] == '-': link = link[:-1] outstack.appendleft('{node}{link}'.format(node=self.monosaccharide_serializer(node), link=link)) # Reset for next pass through the loop is_branch = False children = list((p, link) for p, link in node.links.items() if link.is_parent(node)) if len(children) > 1: for pos, link in children[:-1]: branch = self.linkage_serializer.format_branch( self.branch_to_iupac(link.child, link, is_branch=True)) outstack.appendleft(branch) pos, link = children[-1] stack.append((link, link.child)) elif len(children) == 1: pos, link = children[0] stack.append((link, link.child)) return outstack def glycan_to_iupac(self, structure, **kwargs): '''Translate a |Glycan| structure's branch into IUPAC Three Letter Code.structure Calls :meth:`branch_to_iupac`, a recursive function. Parameters ---------- structure: Glycan or Monosaccharide The glycan to be translated. Translation starts from `glycan.root` if `structure` is a |Glycan|. Returns ------- :class:`str` ''' return ''.join(self.branch_to_iupac(structure)) def __call__(self, structure): """An alias for :meth:`glycan_to_iupac` """ return self.glycan_to_iupac(structure)
glycan_to_iupac = GlycanSerializer() glycan_to_iupac_simple = GlycanSerializer(SimpleMonosaccharideSerializer(), SimpleLinkageSerializer()) def to_iupac(structure, dialect=None): """Translate `structure` into its textual representation using IUPAC Three Letter Code Parameters ---------- structure : |Glycan| or |Monosaccharide| The structure to be translated dialect: :class:`str` One of "extended" or "simple", controlling whether the long-form linkage and monosaccharide notation is used, or the more compact simplified form is used. Defaults to "extended". Returns ------- |str| See Also -------- :class:`GlycanSerializer` """ if dialect is None: dialect = 'extended' if isinstance(structure, Monosaccharide): return monosaccharide_to_iupac(structure) else: if dialect == 'simple': return glycan_to_iupac_simple(structure) return glycan_to_iupac(structure) def aminate_substituent(substituent): if substituent.name.startswith("n_"): # already aminated return substituent aminated = Substituent("n_" + substituent.name) if aminated.composition == {}: raise ValueError("Could not aminate substituent") return aminated
[docs]class SubstituentDeserializer(object): def __init__(self, substituents_map=None, error_on_missing=True): if substituents_map is None: substituents_map = substituents_map_from self.error_on_missing = error_on_missing self.substituents_map = substituents_map def substituent_from_iupac(self, substituents): parts = re.split(r"\(|\)", substituents) for part in parts: if part == "": continue # split_part = re.split(r"(\d+)?", part) split_part = re.split(r"(\d+)", part) if len(split_part) == 3: _, position, name = split_part else: position = UnknownPosition name = split_part[0] try: name = self.substituents_map[name] except KeyError: # Acidic special case: # Often, acidic monosaccharides are written with a trailing A like a substituent while # GlycoCT treats acidic groups as modifications. If an A appears in the substituent suffix # it will fail to be cast as a substituent, but pass it along raw and it will be handled # downstream by :func:`monosaccharide_from_iupac`. if name == "A": pass elif name.startswith("O"): if name[1:] in self.substituents_map: # Some dialects prefix non-amine substituents with O to differentiate them name = self.substituents_map[name[1:]] else: # pragma: no cover if not self.error_on_missing: warnings.warn("No translation rule found to convert %s into a Substituent" % name) continue else: raise IUPACError("No translation rule found to convert %s into a Substituent" % name) yield int(position), name def symbol_to_name(self, symbol): name = self.substituents_map[symbol] return name def __call__(self, substituents): return self.substituent_from_iupac(substituents)
substituent_from_iupac = SubstituentDeserializer() LinkageSpecification = namedtuple("LinkageSpecification", ("child_position", "parent_position", "has_ambiguity")) class LinkageDeserializer(object): pattern = re.compile(r"\((?P<child_linkage>[0-9?/]+)->?(?P<parent_linkage>[0-9?/]+)?\)?") def parse(self, linkage_string): if linkage_string is None: return None match = self.pattern.search(linkage_string) if match is None: raise ValueError(linkage_string) has_ambiguity = "/" in linkage_string match_groups = match.groupdict() child_linkage = match_groups['child_linkage'] if child_linkage is not None: child_linkage = self.parse_position(child_linkage) parent_linkage = match_groups['parent_linkage'] if parent_linkage is not None: parent_linkage = self.parse_position(parent_linkage) return LinkageSpecification(child_linkage, parent_linkage, has_ambiguity) def parse_position(self, position_string): if position_string == '?': return UnknownPosition elif '/' in position_string: return list(map(int, position_string.split('/'))) else: return int(position_string) def __call__(self, linkage_string): return self.parse(linkage_string)
[docs]class SimpleLinkageDeserializer(LinkageDeserializer): pattern = re.compile(r"""\((?P<anomer>[abo?]) (?P<child_linkage>[0-9?/]+)->? (?P<parent_linkage>[0-9?/]+)?\)?""", re.VERBOSE)
parse_linkage_structure = LinkageDeserializer()
[docs]class MonosaccharideDeserializer(object): _pattern = r'''(?:(?P<anomer>[abo?]|alpha|beta|\u03B1|\u03B2)-)? (?P<configuration>[LD?])- (?P<modification>[a-z0-9_\-,]*?) (?P<base_type>(?:[A-Z][a-z]{2}?|(?:[a-z]{3}[A-Z][a-z]{2}))) (?P<ring_type>[xpfo?])? (?P<substituent>[^-]*?) (?P<linkage>-\([0-9?/]+->?[0-9?/]+\)-?)? $''' try: # convert to unicode for Py2 _pattern = _pattern.decode("raw_unicode_escape") except AttributeError: pass pattern = re.compile(_pattern, re.VERBOSE | re.UNICODE) linkage_parser = parse_linkage_structure def __init__(self, modification_parser=None, substituent_parser=None): if modification_parser is None: modification_parser = ModificationDeserializer() self.modification_parser = modification_parser if substituent_parser is None: substituent_parser = SubstituentDeserializer() self.substituent_parser = substituent_parser def has_pattern(self, string): return self.pattern.search(string) def extract_pattern(self, monosaccharide_str): match = self.pattern.search(monosaccharide_str) if match is None: raise IUPACError("Cannot find monosaccharide pattern in {}".format(monosaccharide_str)) match_dict = match.groupdict() return match_dict def ring_bounds(self, residue, ring_type): if residue.ring_start == UnknownPosition: residue.ring_end = UnknownPosition elif ring_type == 'p': residue.ring_end = residue.ring_start + 4 elif ring_type == 'f': residue.ring_end = residue.ring_start + 3 elif ring_type == 'o': residue.ring_end = residue.ring_start = 0 else: residue.ring_end = residue.ring_start = UnknownPosition def build_residue(self, match_dict): try: anomer = anomer_map_from[match_dict['anomer']] except KeyError: anomer = anomer_map_from['?'] base_type = match_dict["base_type"] configuration = match_dict["configuration"].lower() ring_type = match_dict['ring_type'] modification = (match_dict['modification'] or '').rstrip("-") linkage = match_dict.get("linkage") original_base_type = base_type # alternate carbon backbone size encoded as stem{3}Superclass{3} # instead of Stem{3} if len(base_type) == 6: superclass_type = base_type[3:].lower() base_type = base_type[:3].title() else: superclass_type = None try: residue = named_structures.monosaccharides[base_type] except KeyError: raise IUPACError("Unknown Residue Base-type %r" % (original_base_type,)) base_is_modified = len(residue.substituent_links) + len(residue.modifications) > 0 if superclass_type is not None: residue.superclass = superclass_type residue.ring_start = UnknownPosition residue.ring_end = UnknownPosition if len(residue.configuration) == 1: residue.configuration = (configuration,) residue.anomer = anomer self.ring_bounds(residue, ring_type) self.set_modifications(residue, modification) self.set_substituents(residue, match_dict['substituent'], base_is_modified, base_type) return residue, linkage def set_substituents(self, residue, substituent_string, base_is_modified, base_type): i = 0 for position, substituent in self.substituent_parser(substituent_string): i += 1 if position == UnknownPosition and base_is_modified: # Guess at what the user might mean using base_type if base_type == "Neu" and substituent in ["acetyl", "glycolyl"] and i == 1: position = 5 # else: # raise ValueError( # "Cannot have ambiguous location of substituents on a base type which" # " has default modifications or substituents. {} {}".format( # residue, (position, substituent))) # Often, acidic monosaccharides will be suffixed "A" instead of prefixed "a". # Handle this here. if substituent == "A": residue.add_modification(Modification.a, position) continue substituent = Substituent(substituent) try: residue.add_substituent( substituent, position, parent_loss=substituent.attachment_composition_loss(), child_loss='H') except ValueError: # Highly modified large bases have a degenerate encoding, where additional qualifications following # base name *replace* an existing substituent. This behavior may not be expected in other more # common cases. if base_type in {"Neu", "Kdo"}: occupancy = 0 try: unplaced = residue.substituent_links[position][0].child residue.drop_substituent(position) if unplaced.name == "amino": try: substituent = aminate_substituent(substituent) except ValueError: pass except ValueError: # The site contains a modification which can be present alongside the substituent occupancy = 1 except IndexError: occupancy = 1 try: residue.add_substituent( substituent, position, occupancy, parent_loss=substituent.attachment_composition_loss(), child_loss='H') except ValueError: raise IUPACError("Can't resolve %s" % substituent) else: raise def set_modifications(self, residue, modification_string): for pos, mod in self.modification_parser(modification_string): residue.add_modification(mod, pos) def parse_linkage_structure(self, linkage): return self.linkage_parser(linkage) def monosaccharide_from_iupac(self, monosaccharide_str, parent=None): match_dict = self.extract_pattern(monosaccharide_str) residue, linkage = self.build_residue(match_dict) linkage = self.parse_linkage_structure(linkage) self.add_monosaccharide_bond(residue, parent, linkage) return residue, linkage def add_monosaccharide_bond(self, residue, parent, linkage): if parent is not None and linkage != (): if linkage.has_ambiguity: bond = AmbiguousLink( parent, residue, parent_position=linkage.parent_position, child_position=linkage.child_position, parent_loss=Composition("H"), child_loss=Composition("OH")) bond.find_open_position() else: parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0]) def __call__(self, monosaccharide_str, parent=None): return self.monosaccharide_from_iupac(monosaccharide_str, parent=parent) def finalize(self, glycan): pass
[docs]class DerivatizationAwareMonosaccharideDeserializer(MonosaccharideDeserializer): _pattern = r'''(?:(?P<anomer>[abo?]|alpha|beta|\u03B1|\u03B2)-)? (?P<configuration>[LD?])- (?P<modification>[a-z0-9_\-,]*) (?P<base_type>[^-]{3}?) (?P<ring_type>[xpfo?])? (?P<substituent>[^-]*?) (?P<derivatization>\^[^\s-]*?)? (?P<linkage>-\([0-9?/]+->?[0-9?/]+\)-?)?$''' try: # convert to unicode for Py2 _pattern = _pattern.decode("raw_unicode_escape") except AttributeError: pass pattern = re.compile(_pattern, re.VERBOSE | re.UNICODE) def add_monosaccharide_bond(self, residue, parent, linkage): if parent is not None and linkage != (): try: if linkage.has_ambiguity: bond = AmbiguousLink( parent, residue, parent_position=linkage.parent_position, child_position=linkage.child_position, parent_loss=Composition("H"), child_loss=Composition("OH")) bond.find_open_position() else: parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0]) except ValueError: parent_substituent_links_at_site = parent.substituent_links[linkage[1]] if (parent_substituent_links_at_site and parent_substituent_links_at_site[0].child._derivatize): parent.drop_substituent(linkage[1], parent_substituent_links_at_site[0].child) residue_substituent_links_at_site = residue.substituent_links[linkage[0]] if residue_substituent_links_at_site and residue_substituent_links_at_site[0].child._derivatize: residue.drop_substituent(linkage[0], residue_substituent_links_at_site[0].child) if linkage.has_ambiguity: bond = AmbiguousLink( parent, residue, parent_position=linkage.parent_position, child_position=linkage.child_position, parent_loss=Composition("H"), child_loss=Composition("OH")) bond.find_open_position() else: parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0]) def apply_derivatization(self, residue, deriv): if deriv.startswith("^"): deriv = deriv[1:] deriv = self.substituent_parser.symbol_to_name(deriv) derivatize(residue, deriv) else: raise IUPACError("Derivatization Extension Must Start with '^'") def monosaccharide_from_iupac(self, monosaccharide_str, parent=None): match_dict = self.extract_pattern(monosaccharide_str) residue, linkage = self.build_residue(match_dict) linkage = self.parse_linkage_structure(linkage) self.add_monosaccharide_bond(residue, parent, linkage) deriv = match_dict.get("derivatization", '') if deriv is not None and deriv != "": self.apply_derivatization(residue, deriv) return residue, linkage def finalize(self, glycan): for node in glycan: neg_capacity = -node._remaining_capacity() if neg_capacity > 0: unknowns = node.substituent_links[UnknownPosition] to_remove = [] for unknown in unknowns: if unknown.child.node_type is Substituent.node_type and unknown.child._derivatize: if neg_capacity > 0: to_remove.append(unknown) neg_capacity -= 1 else: break for link_to_remove in to_remove: link_to_remove.break_link(refund=True) if neg_capacity > 0: raise ValueError("Could not completely remove overload from %s" % (node,)) def strip_derivatization(self, residue_str, **kwargs): base = residue_str.rsplit("^")[0] return self(base, **kwargs)
[docs]class SimpleMonosaccharideDeserializer(DerivatizationAwareMonosaccharideDeserializer): pattern = re.compile( r'''(?P<modification>[a-z0-9_\-,]*) (?P<base_type>(?:[A-Z][a-z]{2}?|(?:[a-z]{3}[A-Z][a-z]{2}))) (?P<ring_type>[pfox])? (?P<substituent>[^-]*?) (?P<derivatization>\^[^\s-]*?)? (?P<linkage>-?\((?P<anomer>[ab?o]?)[0-9?/]+->?[0-9?/]+\)-?)?$''', re.VERBOSE) linkage_parser = SimpleLinkageDeserializer() def parse_linkage_structure(self, linkage): return self.linkage_parser(linkage) def build_residue(self, match_dict): base_type = match_dict["base_type"] modification = (match_dict['modification'] or '').rstrip("-") try: residue = named_structures.monosaccharides[base_type] except KeyError: raise IUPACError("Unknown Residue Base-type %r" % (base_type,)) base_is_modified = len(residue.substituent_links) + len(residue.modifications) > 0 ring_type = match_dict.get('ring_type') if ring_type is not None: self.ring_bounds(residue, ring_type) self.set_modifications(residue, modification) self.set_substituents(residue, match_dict['substituent'], base_is_modified, base_type) linkage = match_dict.get("linkage") return residue, linkage
monosaccharide_from_iupac = MonosaccharideDeserializer()
[docs]class GlycanDeserializer(object): def __init__(self, monosaccharide_deserializer=None, set_default_positions=True): if monosaccharide_deserializer is None: monosaccharide_deserializer = MonosaccharideDeserializer() self.monosaccharide_deserializer = monosaccharide_deserializer self.set_default_positions = set_default_positions new_branch_open = re.compile(r"(\]-?)$") def add_monosaccharide(self, parent_node, child_node, linkage): # parent_node.add_monosaccharide( # child_node, position=parent_position, child_position=child_position) self.monosaccharide_deserializer.add_monosaccharide_bond( child_node, parent_node, linkage) def glycan_from_iupac(self, text, structure_class=Glycan, **kwargs): last_outedge = None root = None last_residue = None branch_stack = [] # Remove the base text = re.sub(r"\((\d*|\?)->?$", "", text) while len(text) > 0: # If starting a new branch match = self.new_branch_open.search(text) if match is not None: step = match.end(1) - match.start(1) text = text[:-step] branch_stack.append((last_residue, root, last_outedge)) root = None last_residue = None last_outedge = None # If ending a branch elif text[-1] == '[': try: branch_parent, old_root, old_last_outedge = branch_stack.pop() # child_position, parent_position = last_outedge self.add_monosaccharide(branch_parent, root, last_outedge) root = old_root last_residue = branch_parent last_outedge = old_last_outedge text = text[:-1] except IndexError: raise IUPACError("Bad branching at {}".format(len(text))) # Parsing a residue else: match = self.monosaccharide_deserializer.has_pattern(text) if match: next_residue, outedge = self.monosaccharide_deserializer( text[match.start(): match.end()], last_residue) if root is None: last_outedge = outedge root = next_residue last_residue = next_residue text = text[:match.start()] else: raise IUPACError("Could not identify residue '...{}' at {}".format(text[-30:], len(text))) res = structure_class(root=root) self.monosaccharide_deserializer.finalize(res) res.reindex() if self.set_default_positions: self.set_default_positions_for_common_cases(res) return res def __call__(self, text, **kwargs): return self.glycan_from_iupac(text, **kwargs) def set_default_positions_for_common_cases(self, glycan): for node in glycan: candidates = [] for pos, link in list(node.substituent_links.items()): if pos == UnknownPosition: candidates.append(link) for candidate in candidates: substituent = candidate.to(node) position = None if substituent.name == 'n_acetyl': if node.superclass == SuperClass.hex: position = 2 elif node.superclass == SuperClass.non: position = 5 if position is None: continue if not node.is_occupied(position): candidate.break_link(refund=True) candidate.parent_position = position candidate.apply() return glycan
def set_default_positions(glycan): # pragma: nocover for node in glycan: candidates = [] for pos, link in list(node.substituent_links.items()): if pos == UnknownPosition: candidates.append(link) for candidate in candidates: substituent = candidate.to(node) position = None if substituent.name == 'n_acetyl': if node.superclass == SuperClass.hex: position = 2 elif node.superclass == SuperClass.non: position = 5 if position is None: continue if not node.is_occupied(position): candidate.break_link(refund=True) candidate.parent_position = position candidate.apply() return glycan glycan_from_iupac = GlycanDeserializer() glycan_from_iupac_simple = GlycanDeserializer(SimpleMonosaccharideDeserializer()) def from_iupac(text, structure_class=Glycan, resolve_default_positions=True, dialect=None, **kwargs): """Parse the given text into an instance of |Glycan|. If there is only a single monosaccharide in the output, just the Monosaccharide instance is returned. Parameters ---------- text : |str| The text to parser resolve_default_positions: :class:`bool` Whether to assume default positions for common monosaccharide modifiers that are omitted for brevity, such as the postion of n-acetyl on HexNAc. dialect: :class:`str` One of "extended" or "simple", controlling whether the long-form linkage and monosaccharide notation is used, or the more compact simplified form is used. Defaults to "extended". **kwargs: Forwarded to :func:`glycan_from_iupac` Returns ------- |Glycan| or |Monosaccharide| If the resulting structure is just a single monosaccharide, the returned value is a Monosaccharide. """ if dialect is None: dialect = 'extended' if dialect != 'simple': res = glycan_from_iupac( text, structure_class=structure_class, set_default_positions=resolve_default_positions, **kwargs) else: res = glycan_from_iupac_simple( text, structure_class=structure_class, set_default_positions=resolve_default_positions, **kwargs) if len(res) > 1: return res else: return res.root loads = from_iupac dumps = to_iupac class IUPACParser(ParserInterface): def process_result(self, line): structure = loads(line) return structure load = IUPACParser.load Monosaccharide.register_serializer("iupac", dumps) Glycan.register_serializer("iupac", dumps) _dumps_simple = partial(dumps, dialect='simple') _dumps_extended = partial(dumps, dialect='extended') Monosaccharide.register_serializer("iupac_simple", _dumps_simple) Glycan.register_serializer("iupac_simple", _dumps_simple) Monosaccharide.register_serializer("iupac_extended", _dumps_extended) Glycan.register_serializer("iupac_extended", _dumps_extended)