# Source code for glypy.io.wurcs.carbon_descriptors

# Much code here is derived from https://github.com/glycoinfo/glycocttowurcs
# though the representation of glycans in Eurocarbdb.MolecularFramework
# may not map 1:1.
try:
    from collections.abc import Sequence
except ImportError:
    from collections import Sequence

import warnings

from six import string_types as basestring

from .basetype_conversion import (
    descriptors_to_base_type)

from glypy.structure.monosaccharide import Monosaccharide, ReducedEnd
from glypy.structure.constants import SuperClass, Anomer, Modification, Stem, Configuration, UnknownPosition
from glypy import OrderedMultiMap


# One-letter anomer symbols, keyed by glypy anomer constant. The symbol is
# written directly after the anomeric position when a backbone code string
# is assembled in ``CarbonDescriptors.to_backbone_code``.
anomer_map = {
    Anomer.beta: 'b', Anomer.alpha: 'a',
    Anomer.uncyclized: 'o', Anomer.x: 'x',
}


class CarbonDescriptors(Sequence):
    '''An immutable sequence of per-carbon descriptor symbols together with
    the anomer and ring closure information needed to write a WURCS
    ``<BackboneCode>``.

    The object behaves as a read-only sequence over :attr:`descriptors`.

    Attributes
    ----------
    descriptors: tuple
        One descriptor symbol per backbone carbon
    anomer:
        The result of looking up the `anomer` argument in ``Anomer``
    anomeric_position: int or str
        The anomeric carbon position after passing through
        :meth:`_translate_position`
    ring_start:
        The ring start position, or ``UnknownPosition`` if :const:`None` was given
    ring_end:
        The ring end position, or ``UnknownPosition`` if :const:`None` was given
    '''

    def __init__(self, descriptors, anomer, anomeric_position, ring_start, ring_end):
        self.descriptors = tuple(descriptors)
        self.anomer = Anomer[anomer]
        self.anomeric_position = self._translate_position(anomeric_position)
        self.ring_start = ring_start if ring_start is not None else UnknownPosition
        self.ring_end = ring_end if ring_end is not None else UnknownPosition

    def _translate_position(self, position):
        '''Toggle a position between its textual and numeric forms.

        ``'?'`` becomes ``-1``, ``-1`` becomes ``'?'``, and anything else
        is coerced with :class:`int`.

        NOTE(review): because this is a toggle, constructing an instance
        with ``anomeric_position=-1`` stores ``'?'`` rather than ``-1``.
        Confirm callers always pass ``'?'`` for an unknown position.
        '''
        if position == '?':
            position = -1
        elif position == -1:
            position = '?'
        else:
            position = int(position)
        return position

    def __len__(self):
        return len(self.descriptors)

    def __eq__(self, other):
        '''Compare against another descriptor-like object or a backbone
        code string.

        Strings are compared with ``str(self)``. Objects which do not expose
        the compared attributes yield :const:`NotImplemented` so Python can
        fall back to its default comparison instead of raising
        :class:`AttributeError`.
        '''
        if other is None:
            return False
        if isinstance(other, basestring):
            return str(self) == other
        try:
            differs = (
                self.descriptors != other.descriptors or
                self.anomer != other.anomer or
                self.anomeric_position != other.anomeric_position or
                self.ring_start != other.ring_start or
                self.ring_end != other.ring_end)
        except AttributeError:
            return NotImplemented
        return not differs

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Only the descriptors are hashed; equal instances always share
        # descriptors, so this stays consistent with __eq__.
        return hash(self.descriptors)

    def __getitem__(self, i):
        return self.descriptors[i]

    def __iter__(self):
        return iter(self.descriptors)

    def to_d_stereoform(self, code):
        '''Normalize a stretch of stereo-site symbols.

        If the last symbol of `code` is ``'3'`` every ``'3'`` is swapped
        with ``'4'`` and vice versa; otherwise the symbols are copied
        unchanged. An empty `code` produces an empty list.

        Parameters
        ----------
        code: sequence of str
            The stereo-site symbols to normalize

        Returns
        -------
        list
        '''
        is_l_stereoform = bool(code) and code[-1] == '3'
        if not is_l_stereoform:
            return list(code)
        swapped = {'3': '4', '4': '3'}
        return [swapped.get(site, site) for site in code]

    @staticmethod
    def _infer_configuration(site):
        '''Map a generic stereo-site symbol onto a ``Configuration``:
        ``'3'`` is ``l``, ``'4'`` is ``d``, anything else is ``x``.
        '''
        if site == '3':
            return Configuration.l
        if site == '4':
            return Configuration.d
        return Configuration.x

    def to_base_type(self):
        '''Convert the :class:`CarbonDescriptors` into a
        :class:`~.Monosaccharide`, not including substituents.

        Returns
        -------
        :class:`~.Monosaccharide`
        '''
        superclass = SuperClass[len(self)]
        carbon_coding = list(map(str, self))
        modifications = OrderedMultiMap()
        is_reduced = False
        # translate stereocode into generic carbon code
        for i, site in enumerate(carbon_coding):
            if site == '1':
                carbon_coding[i] = '3'
            elif site == '2':
                carbon_coding[i] = '4'
        start = 1
        stems = []
        configurations = []
        anomer = self.anomer
        ring_start = self.ring_start
        ring_end = self.ring_end
        # both termini being 'h' is treated as an open chain, reduced residue
        if carbon_coding[0] == carbon_coding[-1] == 'h':
            anomer = Anomer.uncyclized
            ring_start = 0
            ring_end = 0
            is_reduced = True
        # if the stereosites are all defined
        if 'x' not in carbon_coding:
            # incrementally walk along the carbon sequence
            while start < superclass.value:
                # consider stereosequences of up to four carbons ahead,
                # preferring longer stereosequences
                for width in range(4, 0, -1):
                    # extract the raw stereosequence
                    raw_chunk = carbon_coding[start:start + width]
                    # normalize the stereosequence and convert it to a
                    # string for hash lookup
                    chunk = ''.join(self.to_d_stereoform(raw_chunk))
                    try:
                        stem_name = descriptors_to_base_type[chunk]
                    except KeyError:
                        continue
                    # the lookup succeeded: save the mapped stem name and
                    # infer the chirality from the last stereosite
                    stems.append(stem_name)
                    configurations.append(self._infer_configuration(raw_chunk[-1]))
                    # start the lookup process again from the next
                    # starting location
                    start += len(raw_chunk)
                    break
                else:
                    # No stereosequence could be detected. `chunk` and
                    # `raw_chunk` still hold the final, single carbon
                    # attempt; if it is a stereosite, record it using the
                    # stem registered under 'x'.
                    if chunk in ('3', '4'):
                        stems.append(descriptors_to_base_type['x'])
                        configurations.append(self._infer_configuration(raw_chunk[-1]))
                    start += 1
        else:
            # This cannot handle unspecified nonulonic acids and other modified but unspecified
            # monosaccharides with multiple chiral centers well.
            stems.append(None)
            if carbon_coding[0] in ('u', 'h'):
                configurations.append(None)
            else:
                warnings.warn("Cannot infer chirality from %r" % (str(self),))
                configurations.append(None)
            # Guess if the monosaccharide is large enough to have a second chiral center, because
            # no other rule seems obvious. This could produce incorrect monosaccharide compositions?
            if len(carbon_coding) > 6:
                stems.append(None)
                configurations.append(None)

        anomeric_position = None
        double_bonds = []
        for i, site in enumerate(self):
            if site in ('a', 'u', 'U'):
                anomeric_position = i + 1
                if anomeric_position == 2:
                    modifications[anomeric_position] = Modification.keto
            if site in ('E', 'F'):
                double_bonds.append(i + 1)
            if site in ('d', 'm'):
                modifications[i + 1] = Modification.Deoxygenated
            if site == 'A':
                modifications[i + 1] = Modification.Acidic
        # Only every other marked carbon is recorded, presumably because a
        # double bond is marked on both of its carbons. The modification is
        # keyed by the marked carbon's own position.
        for site in double_bonds[::2]:
            modifications[site] = Modification.en
        stems = [Stem[x] for x in stems[::-1]]
        configurations = configurations[::-1]

        base = Monosaccharide(
            anomer,
            configurations,
            stems,
            superclass,
            ring_start,
            ring_end,
            modifications, reduced=ReducedEnd() if is_reduced else None)
        return base

    @classmethod
    def from_monosaccharide(cls, monosaccharide):
        '''Create a :class:`CarbonDescriptors` from a given
        :class:`~.Monosaccharide`.

        Parameters
        ----------
        monosaccharide: :class:`~.Monosaccharide`
            The monosaccharide to describe

        Returns
        -------
        :class:`CarbonDescriptors`

        Raises
        ------
        ValueError
            If a carboxylic acid modification sits on a non-terminal carbon
            or an alditol modification is not on the first carbon.
        '''
        last_position = monosaccharide.superclass.value
        code = [str(x.value) if x.value is not None else 'x'
                for x in monosaccharide.stereocode]
        code[0] = 'u'
        code[-1] = 'h'
        if monosaccharide.anomer == 'uncyclized':
            code[0] = 'h'
            code[-1] = 'h'
        anomer = monosaccharide.anomer
        anomeric_sites = []
        is_aldose = True
        # encode the modifications onto the carbon descriptor code
        for position, modification in monosaccharide.modifications.items():
            is_terminal = (position == 1 or position == last_position)
            if modification == Modification.Acidic:
                if not is_terminal:
                    raise ValueError("Cannot add a carboxylic acid group to a non-terminal carbon")
                if position == 1:
                    is_aldose = False
                code[position - 1] = 'A'
            elif modification == Modification.Deoxygenated:
                if position == 1:
                    is_aldose = False
                if is_terminal:
                    code[position - 1] = 'm'
                else:
                    code[position - 1] = 'd'
            elif modification == Modification.Ketone:
                is_aldose = False
                anomeric_sites.append(position)
            elif modification == Modification.en:
                code[position - 1] = 'E'
            elif modification == Modification.Alditol:
                is_aldose = False
                if position != 1:
                    raise ValueError("\"aldi\" must occur on the first carbon")
        if is_aldose:
            anomeric_sites.append(1)
        # NOTE(review): when the residue is not an aldose and carries no
        # ketone (e.g. only an alditol or a C1 acid modification) this list
        # is empty and the lookup raises IndexError. Confirm the intended
        # encoding for such residues before changing this.
        anomeric_position = anomeric_sites[0]
        # if the anomeric position is fully defined and the monosaccharide is cyclic
        if monosaccharide.ring_start not in (UnknownPosition, 0):
            code[anomeric_position - 1] = 'a'
        # if the anomeric position is partially undefined, the carbon code is 'u'
        elif monosaccharide.ring_start == UnknownPosition:
            code[anomeric_position - 1] = 'u'
        if monosaccharide.ring_start == UnknownPosition:
            anomeric_position = "?"
        return cls(code, anomer, anomeric_position, monosaccharide.ring_start, monosaccharide.ring_end)

    def to_backbone_code(self):
        '''Convert :class:`CarbonDescriptors` into a string representation
        matching the ``<BackboneCode>`` pattern from WURCS2.0

        Returns
        -------
        :class:`str`
        '''
        # carbon descriptors
        parts = [''.join(map(str, self))]
        # if the anomer is completely undefined, do not include it
        if not (self.anomeric_position == -1 and self.anomer == Anomer.x):
            parts.append("-%s%s" % (self._translate_position(self.anomeric_position),
                                    anomer_map[self.anomer]))
        # if the ring is neither undefined nor open, include it
        if (self.ring_start != -1 and self.ring_end != -1) and (self.ring_start != 0 and self.ring_end != 0):
            parts.append("_%s-%s" % tuple(map(self._translate_position, (self.ring_start, self.ring_end))))
        return ''.join(parts)

    def __str__(self):
        return self.to_backbone_code()

    def __repr__(self):
        descriptors = ''.join(map(str, self))
        template = ("{self.__class__.__name__}({descriptors!r}, {self.anomer.name!r}, "
                    "{self.anomeric_position}, {self.ring_start}, {self.ring_end})")
        return template.format(self=self, descriptors=descriptors)