Source code for glypy.io.iupac

import re
import warnings

from collections import deque, namedtuple
from functools import partial

from glypy.structure import (
    Monosaccharide, Glycan, Link, AmbiguousLink,
    Substituent, constants, named_structures, UnknownPosition)
from glypy.composition import Composition
from glypy.composition.structure_composition import substituent_compositions
from glypy.composition.composition_transform import has_derivatization, derivatize
from glypy.io import format_constants_map
from glypy.io.nomenclature import identity
from glypy.utils import invert_dict

from glypy.io.file_utils import ParserInterface, ParserError


# Static snapshot of monosaccharide names to structures, for copy-free comparison
monosaccharide_reference = dict(named_structures.monosaccharides.items())

# Base types that carry their own trivial name. The sialic and ulosonic acid
# names ("Neu5Ac", "Neu5Gc", "Neu", "Kdn", "Kdo") are intentionally not listed here.
special_base_types = {
    symbol: monosaccharide_reference[symbol]
    for symbol in {
        "Oli", "Tyv",
        "Psi", "Fru", "Sor", "Tag",
        "Xul", "Sed",
    }
}

special_base_type_resolver = identity.MonosaccharideIdentifier(special_base_types)

# Anomer symbols accepted when reading text; the "x" entry is re-keyed as "?"
anomer_map_from = dict(format_constants_map.anomer_map)
anomer_map_from['?'] = anomer_map_from.pop('x')
# Invert before the aliases below are added, so the aliases are not part of the inverse table
anomer_map_to = invert_dict(anomer_map_from)
# Spelled-out and Greek-letter aliases, added to the input table only
anomer_map_from.update([
    ('beta', anomer_map_from['b']),
    ('alpha', anomer_map_from['a']),
    (u"\u03B1", anomer_map_from['a']),
    (u"\u03B2", anomer_map_from['b']),
])


# Short aliases for the structural constant enumerations
Stem, Configuration, Modification, SuperClass = (
    constants.Stem,
    constants.Configuration,
    constants.Modification,
    constants.SuperClass,
)


def tryint(i):
    """Convert ``i`` to :class:`int`, yielding ``-1`` when :func:`int`
    raises :class:`ValueError`.
    """
    try:
        value = int(i)
    except ValueError:
        value = -1
    return value


class IUPACError(ParserError):
    """Raised when IUPAC text cannot be translated."""


def _make_substituent_name(name):
    """Derive a symbol from an underscore-separated substituent name by
    title-casing each piece, joining them, and dropping any parentheses.
    """
    camel_cased = ''.join([token.title() for token in name.split("_")])
    for bracket in ("(", ")"):
        camel_cased = camel_cased.replace(bracket, "")
    return camel_cased


# Every known substituent starts out with a symbol derived from its name
substituents_map_to = dict(
    (name, _make_substituent_name(name)) for name in substituent_compositions
)

# Special Cases: conventional short symbols override the derived ones
substituents_map_to.update([
    ('n_acetyl', "NAc"),
    ('n_glycolyl', "NGc"),
    ('n_sulfate', "NS"),
    ('sulfate', "S"),
    ("methyl", "Me"),
    ("acetyl", "Ac"),
    ("glycolyl", "Gc"),
    ("fluoro", "F"),
    ("amino", "N"),
    ('phosphate', 'P'),
    ('phospho_ethanolamine', 'PEtn'),
    ('ethanolamine', 'Etn'),
])

# Reverse lookup for parsing; the long-form "Phosphate" symbol is accepted as well
substituents_map_from = invert_dict(substituents_map_to)
substituents_map_from['Phosphate'] = 'phosphate'

# Extra textual spellings for modification names
_modification_map_to = dict(deoxy='d')


# For each reference base type, the (target, replacement) substituent name
# pairs to apply; a replacement of None removes the target entirely
_substituent_replacement_rules = dict([
    ('NeuAc', [('n_acetyl', 'acetyl')]),
    ('NeuGc', [('n_glycolyl', 'glycolyl')]),
    ('Neu', [('amino', None)]),
])


class SubstituentSerializer(object):
    """Build the textual encoding for the relevant substituents for
    a provided monosaccharide.

    Attributes
    ----------
    monosaccharide_reference : :class:`dict`
        Map base type to :class:`~.Monosaccharide`
    substitution_rules : :class:`dict`
        Map reference base type name to a list of ``(target, replacement)``
        substituent name pairs applied to residues matching that reference
    substituent_map : :class:`dict`
        Map substituent name to its IUPAC symbol
    """

    def __init__(self, monosaccharides=None, substitution_rules=None, substituent_map=None):
        if monosaccharides is None:
            monosaccharides = monosaccharide_reference
        if substitution_rules is None:
            substitution_rules = _substituent_replacement_rules
        if substituent_map is None:
            substituent_map = substituents_map_to
        self.monosaccharide_reference = monosaccharides
        self.substitution_rules = substitution_rules
        self.substituent_map = substituent_map

    def __call__(self, residue, **kwargs):
        """Alias for :meth:`resolve_substituents`
        """
        return self.resolve_substituents(residue, **kwargs)

    def _symbol_for(self, name):
        """Look up the IUPAC symbol for the substituent called ``name``,
        registering a derived symbol (with a warning) on first sight of an
        unknown name.
        """
        if name in self.substituent_map:
            return self.substituent_map[name]
        part = _make_substituent_name(name)
        warnings.warn("Registering IUPAC  name %r for %r" % (part, name))
        self.substituent_map[name] = part
        # Make the new symbol parseable too, without clobbering an existing entry
        if part not in substituents_map_from:
            substituents_map_from[part] = name
        return part

    def serialize_substituent(self, substituent):
        """Obtain an IUPAC-compatible name for ``substituent``

        Parameters
        ----------
        substituent : :class:`~.Substituent`
            The substituent group to get the name for

        Returns
        -------
        :class:`str`
        """
        return self._symbol_for(substituent.name)

    def resolve_substituents(self, residue, **kwargs):
        """Build a textual encoding of the substituent list for ``residue``.

        Parameters
        ----------
        residue : :class:`~.Monosaccharide`
            The residue to build the substituent list for

        Returns
        -------
        :class:`str`
        """
        encoded = []
        for name, pos in self.get_relevant_substituents(residue):
            if pos in {UnknownPosition, None}:
                pos = ""
            encoded.append("{}{}".format(pos, self._symbol_for(name)))
        if not encoded:
            return ""
        # The first substituent is written bare, successive ones are placed in parentheses
        return encoded[0] + ''.join("({})".format(text) for text in encoded[1:])

    def _test_for_replacement(self, residue, reference, positions, substituents, replacements, exact=False):
        """Apply ``replacements`` to the parallel lists when ``residue``
        matches ``reference``.
        """
        if identity.is_a(residue, reference, exact=exact, short_circuit=True):
            self._substituent_replacement(positions, substituents, replacements)

    def _substituent_replacement(self, positions, substituents, pairs):
        """Rewrite the parallel ``positions``/``substituents`` lists in place.

        For each ``(target, replacement)`` pair the first occurrence of
        ``target`` is renamed to ``replacement``, or removed together with
        its position when ``replacement`` is None. Absent targets are skipped.
        """
        for target, replacement in pairs:
            try:
                i = substituents.index(target)
            except ValueError:
                # The residue does not carry this substituent
                continue
            if replacement is None:
                del substituents[i]
                del positions[i]
            else:
                substituents[i] = replacement

    def get_relevant_substituents(self, residue):
        '''
        Retrieve the set of substituents not implicitly included
        in the base type's symbol name.

        Certain base types have implied substituent groups or partial substituent
        groups. For example, from the perspective of :mod:`glypy`, "n-acetyl" is
        a discrete unit, but from a structural perspective it is a substituted amine
        group that was later acetylated. The "Neu" base type implies an amination of
        carbon 5. In "Neu5Ac", the amine at carbon 5 is acetylated, but because the
        amine is implied, the N of the "NAc" signifier is omitted.

        In IUPAC's trivial coding, substituents are listed following
        the base type, with each substituent encoded as an optional position
        specifier followed immediately by a shortened version of the substituent group's
        name. The first substituent immediately follows the base type, and subsequent
        substituent groups are enclosed in parentheses.

        Returns
        -------
        iterable of (name, position) pairs
        '''
        monosaccharides = self.monosaccharide_reference
        positions = []
        substituents = []
        # Derivatizing substituents are never part of the substituent list
        for position, sub in residue.substituents():
            if not sub._derivatize:
                positions.append(position)
                substituents.append(sub.name)

        for reference_name, replacements in self.substitution_rules.items():
            reference = monosaccharides[reference_name]
            self._test_for_replacement(residue, reference, positions, substituents, replacements)
        return zip(substituents, positions)


# Module-level instance built with the default tables; calling it on a residue
# returns that residue's substituent text
resolve_substituent = SubstituentSerializer()


class ModificationSerializer(object):
    """Build the textual encoding for the relevant modifications for
    a provided monosaccharide base type.
    """

    def _implied_modifications(self, base_type):
        """Return the modifications that ``base_type`` implies by name,
        which therefore must not be written out explicitly.
        """
        if "Neu" in base_type or "Kd" in base_type:
            return (Modification.d, Modification.keto, Modification.a)
        if "Fuc" in base_type or "Qui" in base_type or "Rha" in base_type:
            return (Modification.d,)
        if "Fru" in base_type or "Psi" in base_type or "Sor" in base_type or "Tag" in base_type:
            return (Modification.keto,)
        return ()

    def extract_modifications(self, modifications, base_type):
        """Build a string representing the relevant modifications for ``base_type``.

        Certain base types imply a collection of modifications by default. These
        modifications should not be included in the textual encoding.

        In IUPAC's trivial coding, modifications are specified as a comma-separated
        list preceding the base definition with an optional position designation linked
        by a dash character to the modification name. For example "deoxy" and "?-deoxy"
        both signify a deoxidation at an unknown position, and "6-deoxy" indicates the
        deoxidation appears at carbon 6.

        Parameters
        ----------
        modifications : :class:`~.MultiMap`
            A mapping between position and modifications
        base_type : :class:`str`
            The monosaccharide base type to build the list for. Uses
            a hard-coded list of modifications

        Returns
        -------
        :class:`str`
            The comma-separated list followed by a trailing dash, or the
            empty string when nothing remains to be written
        """
        positions = []
        mods = []
        for position, mod in modifications.items():
            positions.append(position)
            mods.append(mod)

        # Drop one occurrence of each implied modification; an implied
        # modification that is absent is simply skipped for every base type
        for implied in self._implied_modifications(base_type):
            try:
                pop_ix = mods.index(implied)
            except ValueError:
                continue
            del positions[pop_ix]
            del mods[pop_ix]

        buff = []
        for position, mod in zip(positions, mods):
            if position != UnknownPosition:
                buff.append('{position}-{name}'.format(position=position, name=mod.name))
            else:
                buff.append(mod.name)
        out = ','.join(buff)
        if out:
            out += '-'
        return out

    def __call__(self, modifications, base_type):
        """An alias for :meth:`extract_modifications`
        """
        return self.extract_modifications(modifications, base_type)


# Module-level instance; calling it is the same as calling its
# ``extract_modifications`` method
extract_modifications = ModificationSerializer()


class ModificationDeserializer(object):
    """Parses modification signifiers from text into position, :class:`~.Modification` pairs

    Attributes
    ----------
    modification_map : :class:`dict`
        Mapping from text representation to :class:`~.Modification` to provide additional
        names for the existing modification name mapping.
    """

    def __init__(self, modification_map=None):
        # Always start from a private copy of the defaults, then layer any
        # caller-provided spellings on top
        merged = _modification_map_to.copy()
        if modification_map is not None:
            merged.update(modification_map)
        self.modification_map = merged

    def parse_modifications(self, modification_string):
        """Parses the text for site-modification definitions.

        In IUPAC's trivial coding, modifications are specified as a comma-separated
        list preceding the base definition with an optional position designation linked
        by a dash character to the modification name. For example "deoxy" and "?-deoxy"
        both signify a deoxidation at an unknown position, and "6-deoxy" indicates the
        deoxidation appears at carbon 6.

        Parameters
        ----------
        modification_string : str

        Returns
        -------
        list:
            The list of (position, modification) pairs parsed from the string

        Raises
        ------
        IUPACError:
            If the modification signifier cannot be translated
        """
        pairs = []
        for token in modification_string.split(","):
            if token == '':
                continue
            pieces = token.split("-")
            if len(pieces) == 2:
                pos, mod = pieces
            else:
                # No (single) dash: the whole token is the modification name
                pos = UnknownPosition
                mod = token
            if pos == '?':
                # "?-deoxy" is an explicit unknown position
                pos = UnknownPosition
            mod_t = self.modification_map.get(mod, mod)
            try:
                pos = int(pos)
                modification = Modification[mod_t]
            except (KeyError, ValueError):
                raise IUPACError("Could not determine modification from %s" % modification_string)
            pairs.append((pos, modification))
        return pairs

    def __call__(self, modification_string):
        """An alias for :meth:`parse_modifications`
        """
        return self.parse_modifications(modification_string)


# Module-level instance using only the default modification spellings
parse_modifications = ModificationDeserializer()


class MonosaccharideSerializer(object):
    """Serialize a :class:`~.Monosaccharide` object to IUPAC text

    Attributes
    ----------
    modification_extractor: :class:`ModificationSerializer`
        Convert modifications to a text list
    monosaccharide_reference : :class:`dict`
        Map base type to :class:`~.Monosaccharide`
    substituent_resolver : :class:`SubstituentSerializer`
        Convert substituents to a text list
    """

    def __init__(self, monosaccharides=None, substituent_resolver=None, modification_extractor=None):
        if monosaccharides is None:
            monosaccharides = monosaccharide_reference
        self.monosaccharide_reference = monosaccharides
        if substituent_resolver is None:
            substituent_resolver = SubstituentSerializer(monosaccharides)
        self.substituent_resolver = substituent_resolver
        if modification_extractor is None:
            modification_extractor = ModificationSerializer()
        self.modification_extractor = modification_extractor

    def resolve_special_base_type(self, residue):
        """Find the trivial base type name for ``residue``, if it has one.

        Parameters
        ----------
        residue : :class:`~.Monosaccharide`

        Returns
        -------
        :class:`str` or None
            The trivial name, or None when no special name applies
        """
        if residue.superclass == SuperClass.non:
            if residue.stem == (Stem.gro, Stem.gal):
                substituents = [sub.name for p, sub in residue.substituents() if not sub._derivatize]
                modifications = [mod for p, mod in residue.modifications.items()]
                if Modification.a in modifications and\
                   Modification.keto in modifications and\
                   Modification.d in modifications:
                    if not substituents:
                        return "Kdn"
                    # The acetyl / glycolyl / amino distinction is not part
                    # of the base type name, all three are written "Neu"
                    if any(name in substituents for name in ("n_acetyl", "n_glycolyl", "amino")):
                        return "Neu"

        elif residue.superclass == SuperClass.oct:
            if residue.stem == (Stem.man,):
                if Modification.a in residue.modifications[1] and\
                   Modification.keto in residue.modifications[2] and\
                   Modification.d in residue.modifications[3]:
                    return "Kdo"
        elif residue.stem == (Stem.gal,):
            if Modification.d in residue.modifications.values():
                return "Fuc"
        elif residue.stem == (Stem.man,):
            if Modification.d in residue.modifications.values():
                return "Rha"
        elif residue.stem == (Stem.glc,):
            if Modification.d in residue.modifications.values():
                return "Qui"
        # Anything not recognised above is looked up among the special base types
        query = special_base_type_resolver.query(residue)
        if query:
            return special_base_type_resolver.name_map[query]
        return None

    def _base_type_name(self, residue):
        """Pick the base type text: a special name if one applies, else the
        single known stem, else the superclass.
        """
        base_type = self.resolve_special_base_type(residue)
        if base_type is not None:
            return base_type
        if len(residue.stem) == 1 and residue.stem[0] is not Stem.Unknown:
            return residue.stem[0].name.title()
        return residue.superclass.name.title()

    def monosaccharide_to_iupac(self, residue):
        """Encode ``residue`` as
        ``{anomer}-{configuration}-{modification}{base_type}{ring_type}{substituent}``.

        Parameters
        ----------
        residue : :class:`~.Monosaccharide`

        Returns
        -------
        :class:`str`
        """
        template = "{anomer}-{configuration}-{modification}{base_type}{ring_type}{substituent}"
        anomer = anomer_map_to[residue.anomer]
        if residue.configuration[0] is Configuration.Unknown:
            configuration = "?"
        else:
            configuration = residue.configuration[0].name.upper()
        base_type = self._base_type_name(residue)
        modification = self.modification_extractor(residue.modifications, base_type)
        # Only the first letter of the ring type's name is written
        ring_type = residue.ring_type.name[0]
        substituent = self.substituent_resolver(residue)
        return template.format(
            anomer=anomer,
            configuration=configuration,
            modification=modification,
            base_type=base_type,
            ring_type=ring_type,
            substituent=substituent
        )

    def __call__(self, residue):
        """An alias for :meth:`monosaccharide_to_iupac`
        """
        return self.monosaccharide_to_iupac(residue)


class DerivatizationAwareMonosaccharideSerializer(MonosaccharideSerializer):
    """A derivatization aware version of :class:`MonosaccharideSerializer` which
    deviates from the standard IUPAC code to encode derivatization.

    If a :class:`~.Monosaccharide` object has a derivatizing substituent attached to
    it, as detected by :func:`~.has_derivatization`, those substituent groups will
    normally be ignored. With this subclass, a single entry will be appended to the
    monosaccharide encoding joined by an "^" character. For example a permethylated
    hexose would be written "Hex^Me".
    """

    def monosaccharide_to_iupac(self, residue):
        """Encode ``residue`` as the parent class does, then append
        ``^<symbol>`` when :func:`~.has_derivatization` reports a derivatizing
        substituent.

        Returns
        -------
        :class:`str`
        """
        string = super(DerivatizationAwareMonosaccharideSerializer, self).monosaccharide_to_iupac(residue)
        deriv = has_derivatization(residue)
        if deriv:
            string = "%s^%s" % (string, self.substituent_resolver.serialize_substituent(deriv))
        return string


class SimpleMonosaccharideSerializer(DerivatizationAwareMonosaccharideSerializer):
    """Serializer for the compact form, which writes only the modification,
    base type and substituent parts of a residue (plus any derivatization
    suffix).
    """

    def monosaccharide_to_iupac(self, residue):
        """
        Encode a subset of traits of a :class:`Monosaccharide`-like object
        using a limited subset of the IUPAC three letter code.

        Parameters
        ----------
        residue: :class:`~Monosaccharide`
            The object to be encoded

        Returns
        -------
        str
        """
        template = "{modification}{base_type}{substituent}"
        base_type = self.resolve_special_base_type(residue)
        if base_type is None:
            # Fall back on the single known stem, or failing that the superclass
            if len(residue.stem) == 1 and residue.stem[0] is not Stem.Unknown:
                base_type = residue.stem[0].name.title()
            else:
                base_type = residue.superclass.name.title()
        modification = self.modification_extractor(residue.modifications, base_type)
        substituent = self.substituent_resolver(residue)
        string = template.format(
            modification=modification,
            base_type=base_type,
            substituent=substituent
        )

        # The parent implementation is not reused here because it would
        # produce the long form, so the derivatization suffix is added directly
        deriv = has_derivatization(residue)
        if deriv:
            string = "%s^%s" % (string, self.substituent_resolver.serialize_substituent(deriv))
        return string


# Module-level serializer built with the defaults, and a shortcut to its
# bound ``resolve_special_base_type`` method
monosaccharide_to_iupac = MonosaccharideSerializer()
resolve_special_base_type = monosaccharide_to_iupac.resolve_special_base_type


class LinkageSerializer(object):
    """Convert :class:`~.Link` objects and branches into text.

    Attributes
    ----------
    open_edge : :class:`str`
        Text written before a linkage's positions
    close_edge : :class:`str`
        Text written after a linkage's positions
    open_branch : :class:`str`
        Text written before a branch
    close_branch : :class:`str`
        Text written after a branch
    """

    def __init__(self, open_edge='-(', close_edge=')-', open_branch='[', close_branch=']'):
        self.open_edge = open_edge
        self.close_edge = close_edge
        self.open_branch = open_branch
        self.close_branch = close_branch

    def format_linkage(self, linkage):
        """Write ``linkage`` as ``<child position>-<parent position>`` wrapped
        in the edge delimiters, using "?" for an unknown position.

        Returns
        -------
        :class:`str`
        """
        child_position = linkage.child_position if linkage.child_position != UnknownPosition else "?"
        parent_position = linkage.parent_position if linkage.parent_position != UnknownPosition else "?"
        return "{oe}{attach}-{linkage_pos}{ce}".format(
            oe=self.open_edge,
            attach=child_position,
            linkage_pos=parent_position,
            ce=self.close_edge)

    def format_branch(self, branch):
        """Concatenate the text pieces of ``branch`` and wrap them in the
        branch delimiters.

        Parameters
        ----------
        branch : iterable of :class:`str`

        Returns
        -------
        :class:`str`
        """
        return '{ob}{branch}{cb}'.format(
            ob=self.open_branch,
            branch=''.join(branch),
            cb=self.close_branch
        )


class SimpleLinkageSerializer(LinkageSerializer):
    """A :class:`LinkageSerializer` for the compact form: plain parentheses
    as edge delimiters, and the child's anomer symbol written inside the
    linkage.
    """

    def __init__(self, open_edge="(", close_edge=")", open_branch="[", close_branch="]"):
        super(SimpleLinkageSerializer, self).__init__(open_edge, close_edge, open_branch, close_branch)

    def format_linkage(self, linkage):
        """Write ``linkage`` as ``<anomer><child position>-<parent position>``
        wrapped in the edge delimiters, using "?" for an unknown position or
        an anomer without a symbol.

        Returns
        -------
        :class:`str`
        """
        template = "{oe}{anomer}{attach}-{linkage_pos}{ce}"
        child_position = linkage.child_position if linkage.child_position != UnknownPosition else "?"
        parent_position = linkage.parent_position if linkage.parent_position != UnknownPosition else "?"
        return template.format(
            oe=self.open_edge,
            anomer=anomer_map_to.get(linkage.child.anomer, "?"),
            attach=child_position,
            linkage_pos=parent_position,
            ce=self.close_edge)


class GlycanSerializer(object):
    """Converts a :class:`~.Glycan` structure to IUPAC format.

    Also works on individual :class:`~.Monosaccharide` objects, but
    will traverse any links they have to other nodes.

    Attributes
    ----------
    linkage_serializer : :class:`LinkageSerializer`
        An object that converts a :class:`~.Link` object into text
    monosaccharide_serializer : :class:`MonosaccharideSerializer`
        An object that converts a :class:`~.Monosaccharide` object into text
    """

    def __init__(self, monosaccharide_serializer=None, linkage_serializer=None):
        if monosaccharide_serializer is None:
            monosaccharide_serializer = MonosaccharideSerializer()
        if linkage_serializer is None:
            linkage_serializer = LinkageSerializer()
        self.monosaccharide_serializer = monosaccharide_serializer
        self.linkage_serializer = linkage_serializer

    def branch_to_iupac(self, structure=None, attach=None, is_branch=False):
        '''Translate a |Glycan| structure's branch into IUPAC Three Letter Code.
        Recursively operates on branches.

        Parameters
        ----------
        structure: :class:`~.Glycan` or :class:`~.Monosaccharide`
            The glycan to be translated. Translation starts from `glycan.root` if `structure`
            is a |Glycan|. May also be a :class:`~.Monosaccharide` which is the root of a
            branch of the overall structure.
        attach: :class:`~.Link`
            The link through which the structure is attached to its parent. Used for
            recursively handling branches. Defaults to |None|.
        is_branch: :class:`bool`
            Whether this structure contains the root of the overall structure or a branch

        Returns
        -------
        :class:`collections.deque`
        '''
        base = structure.root if isinstance(structure, Glycan) else structure
        stack = [(attach, base)]
        outstack = deque()
        while stack:
            outedge, node = stack.pop()
            link_text = ""
            if outedge is not None:
                link_text = self.linkage_serializer.format_linkage(outedge)
            # The first residue of a branch ends up directly before the closing
            # branch delimiter, so the trailing dash of its linkage is dropped.
            # ``endswith`` keeps this safe when there is no linkage text at all.
            if is_branch and link_text.endswith('-'):
                link_text = link_text[:-1]
            outstack.appendleft('{node}{link}'.format(
                node=self.monosaccharide_serializer(node), link=link_text))
            # Reset for next pass through the loop
            is_branch = False
            children = [(p, link) for p, link in node.links.items() if link.is_parent(node)]
            if not children:
                continue
            # All children but the last are written as bracketed branches,
            # the last child continues the current chain
            for pos, link in children[:-1]:
                branch = self.linkage_serializer.format_branch(
                    self.branch_to_iupac(link.child, link, is_branch=True))
                outstack.appendleft(branch)
            pos, link = children[-1]
            stack.append((link, link.child))
        return outstack

    def glycan_to_iupac(self, structure, **kwargs):
        '''Translate a |Glycan| structure into IUPAC Three Letter Code.

        Calls :meth:`branch_to_iupac`, a recursive function.

        Parameters
        ----------
        structure: Glycan or Monosaccharide
            The glycan to be translated. Translation starts from `glycan.root` if `structure`
            is a |Glycan|.
        **kwargs
            Accepted but not used.

        Returns
        -------
        :class:`str`
        '''
        return ''.join(self.branch_to_iupac(structure))

    def __call__(self, structure):
        """An alias for :meth:`glycan_to_iupac`
        """
        return self.glycan_to_iupac(structure)


# Serializer built with the default monosaccharide and linkage serializers
glycan_to_iupac = GlycanSerializer()

# Serializer for the compact form, pairing the simple monosaccharide and linkage serializers
glycan_to_iupac_simple = GlycanSerializer(SimpleMonosaccharideSerializer(), SimpleLinkageSerializer())


def to_iupac(structure, dialect=None):
    """Translate `structure` into its textual representation using IUPAC Three Letter Code

    Parameters
    ----------
    structure : |Glycan| or |Monosaccharide|
        The structure to be translated
    dialect: :class:`str`
        One of "extended" or "simple", controlling whether the long-form linkage
        and monosaccharide notation is used, or the more compact simplified form
        is used. Defaults to "extended". Any value other than "simple" selects
        the extended form.

    Returns
    -------
    |str|

    See Also
    --------
    :class:`GlycanSerializer`
    """
    if dialect is None:
        dialect = 'extended'
    simple = dialect == 'simple'
    if isinstance(structure, Monosaccharide):
        # A bare residue is written on its own, without traversing its links;
        # the dialect still decides which residue notation is used
        if simple:
            return glycan_to_iupac_simple.monosaccharide_serializer(structure)
        return monosaccharide_to_iupac(structure)
    if simple:
        return glycan_to_iupac_simple(structure)
    return glycan_to_iupac(structure)


def aminate_substituent(substituent):
    """Return the ``n_``-prefixed counterpart of ``substituent``.

    A substituent whose name already starts with ``n_`` is returned as-is.

    Raises
    ------
    ValueError:
        If the ``n_``-prefixed substituent has an empty composition
    """
    if not substituent.name.startswith("n_"):
        candidate = Substituent("n_" + substituent.name)
        if candidate.composition == {}:
            raise ValueError("Could not aminate substituent")
        return candidate
    # already aminated
    return substituent


class SubstituentDeserializer(object):
    """Parse the substituent suffix of a monosaccharide into
    (position, substituent name) pairs.

    Attributes
    ----------
    substituents_map : :class:`dict`
        Map IUPAC symbol to substituent name
    error_on_missing : :class:`bool`
        Whether an untranslatable symbol raises :class:`IUPACError` (the
        default) or is skipped with a warning
    """

    def __init__(self, substituents_map=None, error_on_missing=True):
        if substituents_map is None:
            substituents_map = substituents_map_from
        self.error_on_missing = error_on_missing
        self.substituents_map = substituents_map

    def substituent_from_iupac(self, substituents):
        """Lazily translate the substituent text.

        Parameters
        ----------
        substituents : :class:`str`
            Substituent entries, separated by parentheses

        Yields
        ------
        tuple of (int, str)
            The position and the substituent name. A raw ``"A"`` is passed
            through untranslated.

        Raises
        ------
        IUPACError:
            If a symbol cannot be translated and :attr:`error_on_missing` is true
        """
        for part in re.split(r"\(|\)", substituents):
            if part == "":
                continue
            split_part = re.split(r"(\d+)", part)

            if len(split_part) == 3:
                _, position, name = split_part
            else:
                position = UnknownPosition
                name = split_part[0]

            if name in self.substituents_map:
                name = self.substituents_map[name]
            elif name == "A":
                # Acidic special case:
                # Often, acidic monosaccharides are written with a trailing A like a substituent while
                # GlycoCT treats acidic groups as modifications. If an A appears in the substituent suffix
                # it will fail to be cast as a substituent, but pass it along raw and it will be handled
                # downstream by :func:`monosaccharide_from_iupac`.
                pass
            elif name.startswith("O") and name[1:] in self.substituents_map:
                # Some dialects prefix non-amine substituents with O to differentiate them
                name = self.substituents_map[name[1:]]
            elif self.error_on_missing:
                raise IUPACError("No translation rule found to convert %s into a Substituent" % name)
            else:
                warnings.warn("No translation rule found to convert %s into a Substituent" % name)
                continue
            yield int(position), name

    def symbol_to_name(self, symbol):
        """Translate a single IUPAC symbol, raising :class:`KeyError` if unknown."""
        name = self.substituents_map[symbol]
        return name

    def __call__(self, substituents):
        """An alias for :meth:`substituent_from_iupac`
        """
        return self.substituent_from_iupac(substituents)


# Module-level instance built with the default symbol table
substituent_from_iupac = SubstituentDeserializer()


# Parsed linkage: the child and parent positions plus a flag recording ambiguity
LinkageSpecification = namedtuple("LinkageSpecification", ("child_position", "parent_position", "has_ambiguity"))


class LinkageDeserializer(object):
    """Parse a linkage such as ``(1-4)``, ``(1->4)`` or ``(1-3/6)``.

    The text is searched with :attr:`pattern`. The child-side position is
    required, the parent-side position and the closing parenthesis are
    optional. Each side may be a number, ``?`` for an unknown position, or a
    ``/``-separated list of alternatives.

    Instances are callable; calling one is the same as calling :meth:`parse`.
    """

    pattern = re.compile(r"\((?P<child_linkage>[0-9?/]+)->?(?P<parent_linkage>[0-9?/]+)?\)?")

    def parse(self, linkage_string):
        """Convert linkage text into a :class:`LinkageSpecification`.

        Parameters
        ----------
        linkage_string : str or None
            Text containing the linkage. ``None`` means "no linkage".

        Returns
        -------
        LinkageSpecification or None
            ``None`` when ``linkage_string`` is ``None``. Otherwise a record
            whose ``parent_position`` is ``None`` if the text did not carry a
            parent-side position, and whose ``has_ambiguity`` flag is true
            when ``linkage_string`` contains a ``/``.

        Raises
        ------
        ValueError
            If :attr:`pattern` is not found in ``linkage_string``, or a
            position token is neither ``?`` nor an integer.
        """
        if linkage_string is None:
            return None
        match = self.pattern.search(linkage_string)
        if match is None:
            raise ValueError(linkage_string)
        has_ambiguity = "/" in linkage_string
        groups = match.groupdict()
        child_linkage = groups['child_linkage']
        if child_linkage is not None:
            child_linkage = self.parse_position(child_linkage)
        parent_linkage = groups['parent_linkage']
        if parent_linkage is not None:
            parent_linkage = self.parse_position(parent_linkage)
        return LinkageSpecification(child_linkage, parent_linkage, has_ambiguity)

    def _parse_single_position(self, token):
        """Convert one position token: ``?`` is unknown, anything else must be an integer."""
        if token == '?':
            return UnknownPosition
        return int(token)

    def parse_position(self, position_string):
        """Convert the text of one side of a linkage into position value(s).

        Parameters
        ----------
        position_string : str
            A single token, or several tokens separated by ``/``.

        Returns
        -------
        int or list
            ``UnknownPosition`` for ``?``, an ``int`` for a number, or a list
            with one entry per ``/``-separated alternative. An alternative
            written as ``?`` becomes ``UnknownPosition`` inside the list
            instead of crashing in ``int('?')``.

        Raises
        ------
        ValueError
            If a token is neither ``?`` nor a valid integer.
        """
        if '/' in position_string:
            return [self._parse_single_position(token) for token in position_string.split('/')]
        return self._parse_single_position(position_string)

    def __call__(self, linkage_string):
        """Alias for :meth:`parse`."""
        return self.parse(linkage_string)


class SimpleLinkageDeserializer(LinkageDeserializer):
    """Linkage parser for the simple dialect, e.g. ``(a1-4)`` or ``(?1-6)``.

    Differs from :class:`LinkageDeserializer` only in :attr:`pattern`, which
    additionally accepts one anomer symbol (``a``, ``b``, ``o`` or ``?``)
    directly after the opening parenthesis. The anomer group is optional so
    that every linkage accepted by the residue pattern of
    :class:`SimpleMonosaccharideDeserializer` (where the anomer is optional
    too) can also be parsed here instead of raising :class:`ValueError`.
    The inherited ``parse`` does not read the ``anomer`` group.
    """

    pattern = re.compile(r"""\((?P<anomer>[abo?])?
                             (?P<child_linkage>[0-9?/]+)->?
                             (?P<parent_linkage>[0-9?/]+)?\)?""",
                         re.VERBOSE)


# Shared LinkageDeserializer instance; MonosaccharideDeserializer uses it as its
# class-level default ``linkage_parser``.
parse_linkage_structure = LinkageDeserializer()


class MonosaccharideDeserializer(object):
    """Parse one residue written in the extended IUPAC dialect.

    A residue token is matched against :attr:`pattern`, which captures an
    optional anomer, the configuration, a modification prefix, the base type,
    an optional ring type, a substituent suffix and an optional trailing
    linkage. The pattern is anchored at the end of the text, so searching a
    longer string finds its last residue.

    Instances are callable; calling one is the same as calling
    :meth:`monosaccharide_from_iupac`.

    Parameters
    ----------
    modification_parser : callable, optional
        Called with the modification prefix; must yield
        ``(position, modification)`` pairs. Defaults to a new
        ``ModificationDeserializer``.
    substituent_parser : callable, optional
        Called with the substituent suffix; must yield
        ``(position, substituent_name)`` pairs. Defaults to a new
        ``SubstituentDeserializer``.
    """

    _pattern = r'''(?:(?P<anomer>[abo?]|alpha|beta|\u03B1|\u03B2)-)?
                   (?P<configuration>[LD?])-
                   (?P<modification>[a-z0-9_\-,]*?)
                   (?P<base_type>(?:[A-Z][a-z]{2}?|(?:[a-z]{3}[A-Z][a-z]{2})))
                   (?P<ring_type>[xpfo?])?
                   (?P<substituent>[^-]*?)
                   (?P<linkage>-\([0-9?/]+->?[0-9?/]+\)-?)?
                   $'''
    try:
        # convert to unicode for Py2
        _pattern = _pattern.decode("raw_unicode_escape")
    except AttributeError:
        pass
    pattern = re.compile(_pattern, re.VERBOSE | re.UNICODE)

    # Callable turning the raw linkage text into a LinkageSpecification (or None).
    linkage_parser = parse_linkage_structure

    def __init__(self, modification_parser=None, substituent_parser=None):
        if modification_parser is None:
            modification_parser = ModificationDeserializer()
        self.modification_parser = modification_parser
        if substituent_parser is None:
            substituent_parser = SubstituentDeserializer()
        self.substituent_parser = substituent_parser

    def has_pattern(self, string):
        """Search ``string`` for a residue; return the match object or ``None``."""
        return self.pattern.search(string)

    def extract_pattern(self, monosaccharide_str):
        """Return the named groups of the residue found in ``monosaccharide_str``.

        Raises
        ------
        IUPACError
            If :attr:`pattern` is not found in the text.
        """
        match = self.pattern.search(monosaccharide_str)
        if match is None:
            raise IUPACError("Cannot find monosaccharide pattern in {}".format(monosaccharide_str))
        match_dict = match.groupdict()
        return match_dict

    def ring_bounds(self, residue, ring_type):
        """Set ``residue.ring_end`` (and sometimes ``ring_start``) from the ring symbol.

        If ``residue.ring_start`` is already unknown the end is unknown too,
        whatever the symbol. Otherwise ``p`` puts the end 4 positions after
        the start, ``f`` 3 positions after, ``o`` sets both bounds to 0, and
        any other value (including ``None``) marks both bounds unknown.
        """
        if residue.ring_start == UnknownPosition:
            residue.ring_end = UnknownPosition
        elif ring_type == 'p':
            residue.ring_end = residue.ring_start + 4
        elif ring_type == 'f':
            residue.ring_end = residue.ring_start + 3
        elif ring_type == 'o':
            residue.ring_end = residue.ring_start = 0
        else:
            residue.ring_end = residue.ring_start = UnknownPosition

    def build_residue(self, match_dict):
        """Build a residue from the groups captured by :attr:`pattern`.

        Parameters
        ----------
        match_dict : dict
            The result of :meth:`extract_pattern`.

        Returns
        -------
        tuple
            ``(residue, linkage)`` where ``linkage`` is the still unparsed
            linkage text, or ``None`` when the token had no linkage.

        Raises
        ------
        IUPACError
            If the base type is not a key of ``named_structures.monosaccharides``.
        """
        try:
            anomer = anomer_map_from[match_dict['anomer']]
        except KeyError:
            # No anomer was written (or an unmapped one): use the unknown anomer.
            anomer = anomer_map_from['?']
        base_type = match_dict["base_type"]
        configuration = match_dict["configuration"].lower()
        ring_type = match_dict['ring_type']

        modification = (match_dict['modification'] or '').rstrip("-")

        linkage = match_dict.get("linkage")
        original_base_type = base_type
        # alternate carbon backbone size encoded as stem{3}Superclass{3}
        # instead of Stem{3}
        if len(base_type) == 6:
            superclass_type = base_type[3:].lower()
            base_type = base_type[:3].title()
        else:
            superclass_type = None
        try:
            residue = named_structures.monosaccharides[base_type]
        except KeyError:
            raise IUPACError("Unknown Residue Base-type %r" % (original_base_type,))
        # Recorded before anything is added below, so it describes the named template itself.
        base_is_modified = len(residue.substituent_links) + len(residue.modifications) > 0
        if superclass_type is not None:
            # With an overridden backbone size the template's ring bounds are discarded.
            residue.superclass = superclass_type
            residue.ring_start = UnknownPosition
            residue.ring_end = UnknownPosition

        # Only a single-entry configuration is replaced by the parsed one.
        if len(residue.configuration) == 1:
            residue.configuration = (configuration,)

        residue.anomer = anomer
        self.ring_bounds(residue, ring_type)
        self.set_modifications(residue, modification)
        self.set_substituents(residue, match_dict['substituent'], base_is_modified, base_type)
        return residue, linkage

    def set_substituents(self, residue, substituent_string, base_is_modified, base_type):
        """Attach every substituent described by ``substituent_string`` to ``residue``.

        Parameters
        ----------
        residue : Monosaccharide
            The residue being built; modified in place.
        substituent_string : str
            The substituent suffix, handed to ``self.substituent_parser``.
        base_is_modified : bool
            Whether the named base already carried substituents or
            modifications before parsing added any.
        base_type : str
            The three letter base name, used for base-specific special cases.

        Raises
        ------
        IUPACError
            If a substituent on a ``Neu`` or ``Kdo`` base still cannot be
            placed after the occupied site was handled.
        ValueError
            Re-raised from ``residue.add_substituent`` for any other base.
        """
        i = 0
        for position, substituent in self.substituent_parser(substituent_string):
            i += 1
            if position == UnknownPosition and base_is_modified:
                # Guess at what the user might mean using base_type
                if base_type == "Neu" and substituent in ["acetyl", "glycolyl"] and i == 1:
                    position = 5
            # Often, acidic monosaccharides will be suffixed "A" instead of prefixed "a".
            # Handle this here.
            if substituent == "A":
                residue.add_modification(Modification.a, position)
                continue

            substituent = Substituent(substituent)
            try:
                residue.add_substituent(
                    substituent, position,
                    parent_loss=substituent.attachment_composition_loss(), child_loss='H')
            except ValueError:
                # Highly modified large bases have a degenerate encoding, where additional qualifications following
                # base name *replace* an existing substituent. This behavior may not be expected in other more
                # common cases.
                if base_type in {"Neu", "Kdo"}:
                    occupancy = 0
                    try:
                        unplaced = residue.substituent_links[position][0].child
                        residue.drop_substituent(position)
                        if unplaced.name == "amino":
                            try:
                                substituent = aminate_substituent(substituent)
                            except ValueError:
                                # No aminated form available: keep the substituent as parsed.
                                pass
                    except ValueError:
                        # The site contains a modification which can be present alongside the substituent
                        occupancy = 1
                    except IndexError:
                        # Nothing is attached at the site, so there was nothing to replace.
                        occupancy = 1
                    try:
                        residue.add_substituent(
                            substituent, position, occupancy,
                            parent_loss=substituent.attachment_composition_loss(), child_loss='H')
                    except ValueError:
                        raise IUPACError("Can't resolve %s" % substituent)
                else:
                    raise

    def set_modifications(self, residue, modification_string):
        """Apply every ``(position, modification)`` pair parsed from the prefix."""
        for pos, mod in self.modification_parser(modification_string):
            residue.add_modification(mod, pos)

    def parse_linkage_structure(self, linkage):
        """Parse raw linkage text with :attr:`linkage_parser`."""
        return self.linkage_parser(linkage)

    def monosaccharide_from_iupac(self, monosaccharide_str, parent=None):
        """Parse one residue and, when ``parent`` is given, bond it to ``parent``.

        Parameters
        ----------
        monosaccharide_str : str
            Text containing the residue.
        parent : Monosaccharide, optional
            The residue the new one is attached to.

        Returns
        -------
        tuple
            ``(residue, linkage)`` where ``linkage`` is the parsed linkage
            returned by :meth:`parse_linkage_structure`.

        Raises
        ------
        IUPACError
            If no residue is found, the base type is unknown, or ``parent``
            is given while the residue carries no linkage.
        """
        match_dict = self.extract_pattern(monosaccharide_str)
        residue, linkage = self.build_residue(match_dict)
        linkage = self.parse_linkage_structure(linkage)

        self.add_monosaccharide_bond(residue, parent, linkage)
        return residue, linkage

    def add_monosaccharide_bond(self, residue, parent, linkage):
        """Create the bond ``parent -> residue`` described by ``linkage``.

        Nothing happens when ``parent`` is ``None`` or ``linkage`` is an empty
        tuple. A linkage flagged as ambiguous becomes an ``AmbiguousLink``;
        otherwise ``parent.add_monosaccharide`` is used.

        Raises
        ------
        IUPACError
            If ``parent`` is given but ``linkage`` is ``None``. The linkage
            parser returns ``None`` for a residue without linkage text, which
            previously surfaced as an ``AttributeError`` on ``None``.
        """
        if parent is None or linkage == ():
            return
        if linkage is None:
            raise IUPACError(
                "Cannot attach %s to its parent residue: no linkage was specified" % (residue,))
        if linkage.has_ambiguity:
            bond = AmbiguousLink(
                parent, residue, parent_position=linkage.parent_position,
                child_position=linkage.child_position,
                parent_loss=Composition("H"), child_loss=Composition("OH"))
            bond.find_open_position()
        else:
            parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0])

    def __call__(self, monosaccharide_str, parent=None):
        """Alias for :meth:`monosaccharide_from_iupac`."""
        return self.monosaccharide_from_iupac(monosaccharide_str, parent=parent)

    def finalize(self, glycan):
        """Hook run on the assembled glycan; this class does nothing here."""
        pass


class DerivatizationAwareMonosaccharideDeserializer(MonosaccharideDeserializer):
    """Residue parser that also understands a ``^name`` derivatization suffix.

    The suffix sits between the substituent text and the linkage. After the
    residue has been bonded to its parent the named derivatization is applied
    to it with ``derivatize``. Bonding and :meth:`finalize` remove
    derivatization groups that occupy sites needed for real structure.
    """

    _pattern = r'''(?:(?P<anomer>[abo?]|alpha|beta|\u03B1|\u03B2)-)?
                   (?P<configuration>[LD?])-
                   (?P<modification>[a-z0-9_\-,]*)
                   (?P<base_type>[^-]{3}?)
                   (?P<ring_type>[xpfo?])?
                   (?P<substituent>[^-]*?)
                   (?P<derivatization>\^[^\s-]*?)?
                   (?P<linkage>-\([0-9?/]+->?[0-9?/]+\)-?)?$'''
    try:
        # convert to unicode for Py2
        _pattern = _pattern.decode("raw_unicode_escape")
    except AttributeError:
        pass
    pattern = re.compile(_pattern, re.VERBOSE | re.UNICODE)

    def _make_bond(self, residue, parent, linkage):
        """Create the bond ``parent -> residue``; an ambiguous linkage becomes an ``AmbiguousLink``."""
        if linkage.has_ambiguity:
            bond = AmbiguousLink(
                parent, residue, parent_position=linkage.parent_position,
                child_position=linkage.child_position,
                parent_loss=Composition("H"), child_loss=Composition("OH"))
            bond.find_open_position()
        else:
            parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0])

    @staticmethod
    def _drop_derivatization_at(node, position):
        """Drop the first substituent at ``position`` of ``node`` if it is a derivatization group."""
        links_at_site = node.substituent_links[position]
        if links_at_site and links_at_site[0].child._derivatize:
            node.drop_substituent(position, links_at_site[0].child)

    def add_monosaccharide_bond(self, residue, parent, linkage):
        """Bond ``residue`` to ``parent``, displacing derivatization groups if needed.

        Nothing happens when ``parent`` is ``None`` or ``linkage`` is an empty
        tuple. If the first attempt raises ``ValueError``, a derivatization
        group at the parent-side site and one at the child-side site are
        dropped (parent first) and the bond is attempted once more; a second
        ``ValueError`` propagates.

        Raises
        ------
        IUPACError
            If ``parent`` is given but ``linkage`` is ``None``. The linkage
            parser returns ``None`` for a residue without linkage text, which
            previously surfaced as an ``AttributeError`` on ``None``.
        """
        if parent is None or linkage == ():
            return
        if linkage is None:
            raise IUPACError(
                "Cannot attach %s to its parent residue: no linkage was specified" % (residue,))
        try:
            self._make_bond(residue, parent, linkage)
        except ValueError:
            self._drop_derivatization_at(parent, linkage[1])
            self._drop_derivatization_at(residue, linkage[0])
            self._make_bond(residue, parent, linkage)

    def apply_derivatization(self, residue, deriv):
        """Derivatize ``residue`` with the group named by ``deriv``.

        Parameters
        ----------
        residue : Monosaccharide
            The residue to derivatize in place.
        deriv : str
            The suffix including its leading ``^``; the remainder is looked
            up with ``self.substituent_parser.symbol_to_name``.

        Raises
        ------
        IUPACError
            If ``deriv`` does not start with ``^``.
        """
        if deriv.startswith("^"):
            deriv = deriv[1:]
            deriv = self.substituent_parser.symbol_to_name(deriv)
            derivatize(residue, deriv)
        else:
            raise IUPACError("Derivatization Extension Must Start with '^'")

    def monosaccharide_from_iupac(self, monosaccharide_str, parent=None):
        """Parse one residue, bond it to ``parent`` and apply its derivatization.

        Returns
        -------
        tuple
            ``(residue, linkage)`` with the parsed linkage.
        """
        match_dict = self.extract_pattern(monosaccharide_str)
        residue, linkage = self.build_residue(match_dict)
        linkage = self.parse_linkage_structure(linkage)

        # Bond first, so the derivatization is applied to the already-attached residue.
        self.add_monosaccharide_bond(residue, parent, linkage)

        deriv = match_dict.get("derivatization", '')
        if deriv is not None and deriv != "":
            self.apply_derivatization(residue, deriv)

        return residue, linkage

    def finalize(self, glycan):
        """Remove surplus derivatization groups from over-filled residues.

        For every node whose ``_remaining_capacity()`` is negative,
        derivatization substituents stored at the unknown position are broken
        off (with ``refund=True``) until the overload is gone.

        Raises
        ------
        ValueError
            If an overload remains after all removable groups were used.
        """
        for node in glycan:
            neg_capacity = -node._remaining_capacity()
            if neg_capacity > 0:
                unknowns = node.substituent_links[UnknownPosition]
                to_remove = []
                for unknown in unknowns:
                    if unknown.child.node_type is Substituent.node_type and unknown.child._derivatize:
                        if neg_capacity > 0:
                            to_remove.append(unknown)
                            neg_capacity -= 1
                        else:
                            break
                # Break the links only after the scan, not while iterating over them.
                for link_to_remove in to_remove:
                    link_to_remove.break_link(refund=True)
                if neg_capacity > 0:
                    raise ValueError("Could not completely remove overload from %s" % (node,))

    def strip_derivatization(self, residue_str, **kwargs):
        """Parse ``residue_str`` with everything from its first ``^`` onwards removed.

        NOTE(review): a linkage written after the derivatization suffix is
        cut off together with it - confirm this is intended for callers that
        pass ``parent``.
        """
        base = residue_str.rsplit("^")[0]
        return self(base, **kwargs)


class SimpleMonosaccharideDeserializer(DerivatizationAwareMonosaccharideDeserializer):
    """Residue parser for the simple dialect, e.g. ``GlcNAc(b1-4)``.

    Compared to the extended dialect the token carries no leading anomer or
    configuration; the anomer symbol is written inside the linkage
    parentheses instead and is captured by the ``anomer`` group of
    :attr:`pattern`.
    """

    pattern = re.compile(
        r'''(?P<modification>[a-z0-9_\-,]*)
            (?P<base_type>(?:[A-Z][a-z]{2}?|(?:[a-z]{3}[A-Z][a-z]{2})))
            (?P<ring_type>[pfox])?
            (?P<substituent>[^-]*?)
            (?P<derivatization>\^[^\s-]*?)?
            (?P<linkage>-?\((?P<anomer>[ab?o]?)[0-9?/]+->?[0-9?/]+\)-?)?$''', re.VERBOSE)

    linkage_parser = SimpleLinkageDeserializer()

    def parse_linkage_structure(self, linkage):
        """Parse raw linkage text with :attr:`linkage_parser`."""
        return self.linkage_parser(linkage)

    def build_residue(self, match_dict):
        """Build a residue from the groups captured by :attr:`pattern`.

        Parameters
        ----------
        match_dict : dict
            The result of ``extract_pattern``.

        Returns
        -------
        tuple
            ``(residue, linkage)`` where ``linkage`` is the still unparsed
            linkage text, or ``None`` when the token had no linkage.

        Raises
        ------
        IUPACError
            If the base type is not a key of ``named_structures.monosaccharides``.
        """
        base_type = match_dict["base_type"]
        modification = (match_dict['modification'] or '').rstrip("-")

        try:
            residue = named_structures.monosaccharides[base_type]
        except KeyError:
            raise IUPACError("Unknown Residue Base-type %r" % (base_type,))
        # Recorded before anything is added below, so it describes the named template itself.
        base_is_modified = len(residue.substituent_links) + len(residue.modifications) > 0

        # The pattern captures the anomer written inside the linkage; apply it to the
        # residue instead of discarding it. When no anomer was written the anomer of
        # the named template is left untouched.
        anomer_symbol = match_dict.get('anomer')
        if anomer_symbol:
            try:
                residue.anomer = anomer_map_from[anomer_symbol]
            except KeyError:
                # Symbol without a mapping: keep the template's anomer.
                pass

        ring_type = match_dict.get('ring_type')
        if ring_type is not None:
            self.ring_bounds(residue, ring_type)
        self.set_modifications(residue, modification)
        self.set_substituents(residue, match_dict['substituent'], base_is_modified, base_type)
        linkage = match_dict.get("linkage")
        return residue, linkage


# Module-level, ready-to-call residue parser for the extended dialect, built with default sub-parsers.
monosaccharide_from_iupac = MonosaccharideDeserializer()


class GlycanDeserializer(object):
    """Parse a complete IUPAC glycan string into a glycan structure.

    Instances are callable; calling one is the same as calling
    :meth:`glycan_from_iupac`.

    Parameters
    ----------
    monosaccharide_deserializer : MonosaccharideDeserializer, optional
        Parses individual residues and creates the bonds between them.
        Defaults to a new :class:`MonosaccharideDeserializer`.
    set_default_positions : bool, optional
        Default for whether :meth:`set_default_positions_for_common_cases`
        runs on every parsed glycan. Defaults to ``True``.
    """

    def __init__(self, monosaccharide_deserializer=None, set_default_positions=True):
        if monosaccharide_deserializer is None:
            monosaccharide_deserializer = MonosaccharideDeserializer()
        self.monosaccharide_deserializer = monosaccharide_deserializer
        self.set_default_positions = set_default_positions

    # A "]" (optionally followed by "-") at the end of the remaining text.
    new_branch_open = re.compile(r"(\]-?)$")

    def add_monosaccharide(self, parent_node, child_node, linkage):
        """Bond ``child_node`` onto ``parent_node`` using the residue deserializer."""
        self.monosaccharide_deserializer.add_monosaccharide_bond(
            child_node, parent_node, linkage)

    def glycan_from_iupac(self, text, structure_class=Glycan, set_default_positions=None, **kwargs):
        """Parse ``text`` into an instance of ``structure_class``.

        The text is consumed from its end: each step strips either a
        branch marker or the last residue from the remaining text. A trailing
        ``]`` starts collecting a branch and the matching ``[`` attaches the
        collected branch to the residue parsed just before the ``]``.

        Parameters
        ----------
        text : str
            The IUPAC text of the glycan.
        structure_class : type, optional
            Called as ``structure_class(root=root)`` to build the result.
        set_default_positions : bool, optional
            Overrides ``self.set_default_positions`` for this call. ``None``
            (the default) keeps the instance setting. Previously this keyword
            was silently ignored, so callers could not disable the step.
        **kwargs
            Accepted for backward compatibility and ignored.

        Returns
        -------
        object
            The ``structure_class`` instance, re-indexed.

        Raises
        ------
        IUPACError
            If no residue can be identified at the end of the remaining text,
            or an ``IndexError`` occurs while closing a branch (for example a
            ``[`` without a matching ``]``).
        """
        if set_default_positions is None:
            set_default_positions = self.set_default_positions

        last_outedge = None
        root = None
        last_residue = None
        branch_stack = []

        # Remove the base
        text = re.sub(r"\((\d*|\?)->?$", "", text)

        while len(text) > 0:
            # If starting a new branch
            match = self.new_branch_open.search(text)
            if match is not None:
                step = match.end(1) - match.start(1)
                text = text[:-step]
                # Park the current chain; the branch is collected with a fresh state.
                branch_stack.append((last_residue, root, last_outedge))
                root = None
                last_residue = None
                last_outedge = None
            # If ending a branch
            elif text[-1] == '[':
                try:
                    branch_parent, old_root, old_last_outedge = branch_stack.pop()
                    # The branch root is attached with the linkage recorded for it.
                    self.add_monosaccharide(branch_parent, root, last_outedge)
                    root = old_root
                    last_residue = branch_parent
                    last_outedge = old_last_outedge
                    text = text[:-1]
                except IndexError:
                    raise IUPACError("Bad branching at {}".format(len(text)))
            # Parsing a residue
            else:
                match = self.monosaccharide_deserializer.has_pattern(text)
                if match:
                    next_residue, outedge = self.monosaccharide_deserializer(
                        text[match.start(): match.end()], last_residue)
                    if root is None:
                        # First residue of this chain: remember it and its outgoing linkage.
                        last_outedge = outedge
                        root = next_residue
                    last_residue = next_residue
                    text = text[:match.start()]
                else:
                    raise IUPACError("Could not identify residue '...{}' at {}".format(text[-30:], len(text)))
        res = structure_class(root=root)
        self.monosaccharide_deserializer.finalize(res)
        res.reindex()
        if set_default_positions:
            self.set_default_positions_for_common_cases(res)
        return res

    def __call__(self, text, **kwargs):
        """Alias for :meth:`glycan_from_iupac`."""
        return self.glycan_from_iupac(text, **kwargs)

    def set_default_positions_for_common_cases(self, glycan):
        """Move ``n_acetyl`` groups with unknown position to their usual site.

        For every node, each substituent link stored at the unknown position
        whose substituent is named ``n_acetyl`` is moved to position 2 when
        the node's superclass is ``hex`` and to position 5 when it is ``non``,
        provided that position is not occupied. Everything else is left alone.

        Returns
        -------
        object
            The same ``glycan``, modified in place.
        """
        for node in glycan:
            candidates = []
            # Collect first: re-applying links below changes node.substituent_links.
            for pos, link in list(node.substituent_links.items()):
                if pos == UnknownPosition:
                    candidates.append(link)
            for candidate in candidates:
                substituent = candidate.to(node)
                position = None
                if substituent.name == 'n_acetyl':
                    if node.superclass == SuperClass.hex:
                        position = 2
                    elif node.superclass == SuperClass.non:
                        position = 5
                if position is None:
                    continue
                if not node.is_occupied(position):
                    candidate.break_link(refund=True)
                    candidate.parent_position = position
                    candidate.apply()
        return glycan


def set_default_positions(glycan):  # pragma: nocover
    """Move ``n_acetyl`` groups with unknown position to their usual site.

    For every node of ``glycan``, each substituent link stored at the unknown
    position whose substituent is named ``n_acetyl`` is re-attached at
    position 2 when the node's superclass is ``hex`` and at position 5 when it
    is ``non``, provided that position is not occupied.

    Parameters
    ----------
    glycan : iterable
        Iterated to obtain the nodes to inspect.

    Returns
    -------
    object
        The same ``glycan``, modified in place.
    """
    for node in glycan:
        # Snapshot the unplaced links before any of them is re-applied.
        unplaced_links = [
            link for pos, link in list(node.substituent_links.items())
            if pos == UnknownPosition
        ]
        for link in unplaced_links:
            if link.to(node).name != 'n_acetyl':
                continue
            if node.superclass == SuperClass.hex:
                target = 2
            elif node.superclass == SuperClass.non:
                target = 5
            else:
                continue
            if node.is_occupied(target):
                continue
            link.break_link(refund=True)
            link.parent_position = target
            link.apply()
    return glycan


# Shared glycan parser built with GlycanDeserializer's defaults (a plain
# MonosaccharideDeserializer); from_iupac uses it for every dialect other than "simple".
glycan_from_iupac = GlycanDeserializer()

# Shared glycan parser built on SimpleMonosaccharideDeserializer; from_iupac uses it
# for the "simple" dialect.
glycan_from_iupac_simple = GlycanDeserializer(SimpleMonosaccharideDeserializer())


def from_iupac(text, structure_class=Glycan, resolve_default_positions=True, dialect=None, **kwargs):
    """Parse the given text into an instance of |Glycan|. If there is only a single monosaccharide
    in the output, just the Monosaccharide instance is returned.

    Parameters
    ----------
    text : |str|
        The text to parse
    structure_class : type
        The class instantiated for the parsed structure. Defaults to |Glycan|.
    resolve_default_positions: :class:`bool`
        Whether to assume default positions for common monosaccharide modifiers
        that are omitted for brevity, such as the position of n-acetyl on HexNAc.
    dialect: :class:`str`
        One of "extended" or "simple", controlling whether the long-form linkage
        and monosaccharide notation is used, or the more compact simplified form
        is used. Defaults to "extended". Any value other than "simple" selects
        the extended parser.
    **kwargs:
        Forwarded to :func:`glycan_from_iupac`

    Returns
    -------
    |Glycan| or |Monosaccharide|
        If the resulting structure is just a single monosaccharide, the returned value is a Monosaccharide.
    """
    if dialect is None:
        dialect = 'extended'
    parser = glycan_from_iupac_simple if dialect == 'simple' else glycan_from_iupac
    if bool(parser.set_default_positions) != bool(resolve_default_positions):
        # The shared module-level parsers carry their own fixed setting. Use a
        # one-off parser that shares the same residue deserializer so that
        # ``resolve_default_positions`` takes effect for this call.
        parser = GlycanDeserializer(
            parser.monosaccharide_deserializer,
            set_default_positions=resolve_default_positions)
    res = parser(
        text, structure_class=structure_class,
        set_default_positions=resolve_default_positions,
        **kwargs)
    if len(res) > 1:
        return res
    else:
        return res.root


# Module-level aliases: ``loads`` is from_iupac (text -> structure) and ``dumps`` is to_iupac.
loads = from_iupac
dumps = to_iupac


class IUPACParser(ParserInterface):
    """``ParserInterface`` subclass whose records are IUPAC strings."""

    def process_result(self, line):
        """Parse ``line`` with :func:`loads` and return the resulting structure."""
        return loads(line)


# Expose IUPACParser.load at module level.
# NOTE(review): ``load`` is not defined on IUPACParser itself, so it is presumably
# inherited from ParserInterface - confirm.
load = IUPACParser.load


# Register the IUPAC writer under the name "iupac" on both structure types.
Monosaccharide.register_serializer("iupac", dumps)
Glycan.register_serializer("iupac", dumps)

# Writers with the dialect keyword pre-bound.
_dumps_simple = partial(dumps, dialect='simple')
_dumps_extended = partial(dumps, dialect='extended')

Monosaccharide.register_serializer("iupac_simple", _dumps_simple)
Glycan.register_serializer("iupac_simple", _dumps_simple)

Monosaccharide.register_serializer("iupac_extended", _dumps_extended)
Glycan.register_serializer("iupac_extended", _dumps_extended)