Source code for glypy.algorithms.similarity

import operator
from collections import defaultdict
import functools

from six import string_types as basestring

from glypy import Substituent, monosaccharides
from glypy.structure.constants import Modification, Stem, UnknownPosition


class NodeSimilarityComparator(object):
    '''A heuristic comparison for measuring similarity between monosaccharides.

    Compares:
        1. ring_start and ring_end
        2. superclass
        3. configuration
        4. stem
        5. anomer
        6. If `include_modifications`, each modification
        7. If `include_substituents`, each substituent
        8. If `include_children`, each child |Monosaccharide|

    Attributes
    ----------
    include_substituents: bool
        Include substituents in comparison (Defaults |True|)
    include_modifications: bool
        Include modifications in comparison (Defaults |True|)
    include_children: bool
        Include children in comparison (Defaults |False|)
    exact: bool
        Penalize for having unmatched attachments (Defaults |True|)
    ignore_reduction: bool
        Whether or not to include differences in reduction state as a
        mismatch
    ignore_ring: bool
        Whether or not to include differences in ring coordinates as
        a mismatch
    treat_null_as_wild: bool
        Whether or not to treat traits with a value of :const:`None` or
        :const:`~.UnknownPosition` as always matching when the null
        value is on the *target* residue (the residue that traits are being
        matched to).
    short_circuit_after: None or Number
        Controls whether to quit comparing nodes if the difference
        becomes too large, useful for speeding up pessimistic
        comparisons
    visited: set
        Tracks which node pairs have already been compared to break
        cycles. This carries state across multiple calls to :meth:`compare`
        and must be reset by calling :meth:`reset` before reusing an
        instance on new structures.
    '''
    def __init__(self, include_substituents=True, include_modifications=True,
                 include_children=False, exact=True, ignore_reduction=False,
                 ignore_ring=False, treat_null_as_wild=True,
                 match_attachement_positions=False, short_circuit_after=None,
                 visited=None):
        if visited is None:
            visited = set()
        self.include_substituents = include_substituents
        self.include_modifications = include_modifications
        self.include_children = include_children
        self.exact = exact
        self.ignore_ring = ignore_ring
        self.ignore_reduction = ignore_reduction
        self.treat_null_as_wild = treat_null_as_wild
        self.match_attachement_positions = match_attachement_positions
        self.visited = visited
        self.short_circuit_after = short_circuit_after

    def reset(self):
        self.visited.clear()

    @classmethod
    def similarity(cls, node, target, include_substituents=True,
                   include_modifications=True, include_children=False,
                   exact=True, ignore_reduction=False, ignore_ring=False,
                   treat_null_as_wild=True, match_attachement_positions=False,
                   short_circuit_after=None, visited=None):
        """A heuristic comparison for measuring similarity between monosaccharides.

        Compares:
            1. ring_start and ring_end
            2. superclass
            3. configuration
            4. stem
            5. anomer
            6. If `include_modifications`, each modification
            7. If `include_substituents`, each substituent
            8. If `include_children`, each child |Monosaccharide|

        The result is two numbers, the observed similarity between `node` and `target`,
        and the similarity between `target` and itself. `expected - observed` is there
        number of differences observed between the two monosaccharides, which can be useful
        for expressing how far apart two monosaccharides are in feature space. For more distant
        similarity testing, especially when considering children, the ratio `observed / expected`
        might be used instead.

        Similarity is not symmetric, e.g. ``a -> b != b -> a``. A commutative version of similarity
        can be used by calculating both directions, and taking the result with the smallest error.

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against
        include_substituents: bool
            Include substituents in comparison (Defaults |True|)
        include_modifications: bool
            Include modifications in comparison (Defaults |True|)
        include_children: bool
            Include children in comparison (Defaults |False|)
        exact: bool
            Penalize for having unmatched attachments (Defaults |True|)
        ignore_reduction: bool
            Whether or not to include differences in reduction state as a
            mismatch
        ignore_ring: bool
            Whether or not to include differences in ring coordinates as
            a mismatch
        treat_null_as_wild: bool
            Whether or not to treat traits with a value of :const:`None` or
            :const:`~.UnknownPosition` as always matching when the null
            value is on the *target* residue (the residue that traits are being
            matched to).
        short_circuit_after: None or Number
            Controls whether to quit comparing nodes if the difference
            becomes too large, useful for speeding up pessimistic
            comparisons
        visited: set
            Tracks which node pairs have already been compared to break
            cycles. This carries state across multiple calls to :meth:`compare`
            and must be reset by calling :meth:`reset` before reusing an
            instance on new structures.

        Returns
        -------
        :class:`int`: observed
            The number of observed features that matched
        :class:`int`: expected
            The number of features that could have been matched
        """
        inst = cls(
            include_substituents=include_substituents,
            include_modifications=include_modifications,
            include_children=include_children,
            exact=exact, ignore_reduction=ignore_reduction,
            ignore_ring=ignore_ring, treat_null_as_wild=treat_null_as_wild,
            match_attachement_positions=match_attachement_positions,
            short_circuit_after=short_circuit_after,
            visited=visited)
        return inst.compare(node, target)

    def compare_anomer(self, node, target):
        """Compare :attr:`~.Monosaccharide.anomer` of `node` and `target`

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against

        Returns
        -------
        test: int
            The similarity between node and target
        reference: int
            The expected similarity between target and itself
        """
        test = (node.anomer == target.anomer) or ((target.anomer.value is None) and self.treat_null_as_wild)
        reference = 1
        return int(test), reference

    def compare_compositions(self, node, target):
        """Compre :meth:`~.Monosaccharide.total_composition` of `node` and `target`.

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against

        Returns
        -------
        test: int
            The similarity between node and target
        reference: int
            The expected similarity between target and itself
        """
        test = int(node.total_composition() == target.total_composition())
        reference = 1
        return test, reference

    def compare_ring_structure(self, node, target):
        """Compare the :attr:`~.Monosaccharide.superclass`, :attr:`~.Monosaccharide.stem`,
        :attr:`~.Monosaccharide.configuration`, and possibly :attr:`~.Monosaccharide.ring_start`
        and :attr:`~.Monosaccharide.ring_end` of `node` and `target`.

        If :attr:`ignore_ring` is :const:`True`, ring positions will be assumed to match.

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against

        Returns
        -------
        test: int
            The similarity between node and target
        reference: int
            The expected similarity between target and itself
        """
        test = reference = 0
        test += (node.superclass == target.superclass) or ((target.superclass.value is None) and
                                                           self.treat_null_as_wild)
        reference += 1
        test += (node.stem == target.stem) or ((target.stem[0].value is None) and self.treat_null_as_wild)
        reference += 1
        test += (node.configuration == target.configuration) or ((target.configuration[0].value is None) and
                                                                 self.treat_null_as_wild)
        reference += 1
        if not self.ignore_ring:
            test += (node.ring_start == target.ring_start) or ((target.ring_start == UnknownPosition) and
                                                               self.treat_null_as_wild)
            reference += 1
            test += (node.ring_end == target.ring_end) or ((target.ring_end == UnknownPosition) and
                                                           self.treat_null_as_wild)
            reference += 1
        else:
            test += 2
            reference += 2
        return test, reference

    def compare_modifications(self, node, target):
        """Compare the modifications of `node` and `target`

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against

        Returns
        -------
        test: int
            The similarity between node and target
        reference: int
            The expected similarity between target and itself
        """
        test = reference = 0
        node_reduced = False
        target_reduced = False
        n_mods = 0
        if self.match_attachement_positions:
            node_mods = node.modifications
            n_mods = len(node_mods)
            for pos, mod in target.modifications.items():
                if mod == 'aldi':
                    target_reduced = True
                check = (mod in node_mods[pos])
                if check:
                    if mod == 'aldi':
                        node_reduced = True
                    test += 1
                    n_mods -= 1
                reference += 1
        else:
            node_mods = list(node.modifications.values())
            n_mods = len(node_mods)
            for mod in target.modifications.values():
                if mod == 'aldi':
                    target_reduced = True
                check = (mod in node_mods)
                if check:
                    if mod == 'aldi':
                        node_reduced = True
                    test += 1
                    node_mods.pop(node_mods.index(mod))
                    n_mods -= 1
                reference += 1

        if self.ignore_reduction:
            if target_reduced:
                reference -= 1
            if node_reduced:
                test -= 1
        reference += n_mods if self.exact else 0
        return test, reference

    def compare_substituents(self, node, target):
        """Compare the substituents of `node` and `target`.

        If :attr:`match_attachement_positions` is :const:`True`,
        the positions must match exactly, otherwise only the substituent
        types must match.

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against

        Returns
        -------
        test: int
            The similarity between node and target
        reference: int
            The expected similarity between target and itself
        """
        test = reference = 0
        if self.match_attachement_positions:
            node_subs = defaultdict(list)
            n_subs = 0
            for p, sub in node.substituents():
                n_subs += 1
                node_subs[p].append(sub)
            for pos, sub in target.substituents():
                if (sub in node_subs[pos]):
                    test += 1
                    n_subs -= 1
                reference += 1
        else:
            node_subs = [node_sub for p, node_sub in node.substituents()]
            n_subs = len(node_subs)
            for pos, sub in target.substituents():
                if (sub in node_subs):
                    test += 1
                    node_subs.pop(node_subs.index(sub))
                    n_subs -= 1
                reference += 1
        reference += n_subs if self.exact else 0
        return test, reference

    def _build_child_pair_score_map(self, node, target):
        '''Compute all pair-wise similarities between children
        of ``node`` and ``target``

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against

        Returns
        -------
        dict
        '''
        # TODO: support exactness penalty here, maybe recursively?
        node_children = list(child for p, child in node.children())
        match_index = dict()
        for p, target_child in target.children():
            for node_child in node_children:
                c_res, c_qs = self.compare(node_child, target_child)
                match_index[node_child.id, target_child.id] = (c_res, c_qs)
        return match_index

    def compare_children(self, node, target):
        """Compute the similarity of the set of children between `node` and `target`

        If :attr:`match_attachement_positions` is :const:`True`, this will require
        the positions of child nodes to match exactly, otherwise, all pair-wise combinations
        will be considered and the optimal solution will be selected using :meth:`optimal_assignment`

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against

        Returns
        -------
        test: int
            The similarity between node and target
        reference: int
            The expected similarity between target and itself
        """
        test = reference = 0
        if self.match_attachement_positions:
            node_children = defaultdict(list)
            for pos, child in node.children():
                node_children[pos].append(child)
            target_children = defaultdict(list)
            for pos, child in target.children():
                target_children[pos].append(child)
            for pos, children in target_children.items():
                for t_child, n_child in zip(children, node_children[pos]):
                    test_child, reference_child = self.compare(t_child, n_child)
                    test += test_child
                    reference += reference_child
        else:
            match_index = self._build_child_pair_score_map(node, target)
            assignments = self.optimal_assignment(match_index)
            for ix in assignments:
                a_test, a_reference = match_index[ix]
                test += a_test
                reference += a_reference
        return test, reference

    def _check_short_circuit(self, test, reference):
        return self.short_circuit_after is not None and (test - reference) < self.short_circuit_after

    def compare(self, node, target):
        """Calculate the similarity between `node` and `target`.

        This method does most of the organizational work, calling the appropriate
        methods and checking for short-circuiting.

        Parameters
        ----------
        node : :class:`~.Monosaccharide`
            The reference monosaccharide
        target : :class:`~.Monosaccharide`
            The monosaccharide to compare against

        Returns
        -------
        test: int
            The similarity between node and target
        reference: int
            The expected similarity between target and itself
        """
        key = (node.id, target.id)
        if key in self.visited:
            return 0, 0
        self.visited.add(key)
        test = 0
        reference = 0
        try:
            t, r = self.compare_anomer(node, target)
            test += t
            reference += r
        except AttributeError:
            # must be handling substituents
            t, r = self.compare_compositions(node, target)
            test += t
            reference += r
            return test, reference
        t, r = self.compare_ring_structure(node, target)
        test += t
        reference += r
        if self._check_short_circuit(test, reference):
            return test, reference
        if self.include_modifications:
            t, r = self.compare_modifications(node, target)
            test += t
            reference += r
            if self._check_short_circuit(test, reference):
                return test, reference
        if self.include_substituents:
            t, r = self.compare_substituents(node, target)
            test += t
            reference += r
            if self._check_short_circuit(test, reference):
                return test, reference
        if self.include_children:
            t, r = self.compare_children(node, target)
            test += t
            reference += r
        return test, reference

    def optimal_assignment(self, assignments):
        '''
        Given a set of possibly overlapping matches, find the
        optimal solution.
        '''
        diff_map = dict()
        for ids in assignments:
            diff_map[ids] = operator.sub(*assignments[ids])

        index_pair_sets = self.build_unique_index_pairs(assignments)

        def ordering_fn(index_pairs):
            total = 0
            for ix in index_pairs:
                total += max(assignments[ix])
            return total

        ordered_index_pairs_sets = sorted(index_pair_sets, key=ordering_fn, reverse=True)

        best_score = -float('inf')
        best_mapping = {}
        # prefer solutions which have a higher maximum value (more points of comparison)
        for assignment in ordered_index_pairs_sets:
            current_score = 0
            for ix in assignment:
                current_score += diff_map[ix]
            if current_score > best_score:
                best_score = current_score
                best_mapping = assignment
        return best_mapping

    def build_unique_index_pairs(self, pairs):
        '''
        Generate all unique non-overlapping sets of pairs, given in
        `pairs`
        '''
        depth = 0
        pairings = defaultdict(set)
        for a, b in pairs:
            pairings[a].add(b)
        next_current = [()]
        options_a = list(pairings)
        partial_solutions = set()
        while depth < len(options_a):
            for current in next_current:
                if len(current) > 0:
                    current_a, current_b = map(set, zip(*current))
                else:
                    current_b = set()
                components = set()
                a = options_a[depth]
                for b in pairings[a] - current_b:
                    components.add((current + ((a, b),)))
                partial_solutions.update(components)
            depth += 1
            next_current = partial_solutions
            partial_solutions = set()
        return list(next_current)


monosaccharide_similarity = NodeSimilarityComparator.similarity


[docs]def commutative_similarity(node, target, tolerance=0, *args, **kwargs):
    """Apply :func:`monosaccharide_similarity` to ``node`` and ``target`` for both
    ``node --> target`` and ``target --> node``, returning whether either comparison
    passes the tolerance threshold.

    Parameters
    ----------
    node: :class:`~.Monosaccharide`
        The reference monosaccharide
    target: :class:`~.Monosaccharide`
        The monosaccharide to compare against
    tolerance: :class:`int`, optional
        The minimum number of errors to tolerate
    *args:
        Forwarded to :func:`monosaccharide_similarity`
    **kwargs:
        Forwarded to :func:`monosaccharide_similarity`

    Returns
    -------
    :class:`bool`
    """
    obs, expect = monosaccharide_similarity(node, target, *args, **kwargs)
    if (expect - obs) <= tolerance:
        return True
    else:
        obs, expect = monosaccharide_similarity(target, node, *args, **kwargs)
        return (expect - obs) <= tolerance


[docs]def commutative_similarity_score(node, target, *args, **kwargs):
    """Apply :func:`monosaccharide_similarity` to ``node`` and ``target`` for both
    ``node --> target`` and ``target --> node``, returning the maximally normalized
    ratio of `observed / expected`.

    Parameters
    ----------
    node: :class:`~.Monosaccharide`
        The reference monosaccharide
    target: :class:`~.Monosaccharide`
        The monosaccharide to compare against
    *args:
        Forwarded to :func:`monosaccharide_similarity`
    **kwargs:
        Forwarded to :func:`monosaccharide_similarity`

    Returns
    -------
    :class:`float`:
        The maximal similarity score ratio
    """
    a_b, b_b = monosaccharide_similarity(node, target, *args, **kwargs)
    b_a, a_a = monosaccharide_similarity(target, node, *args, **kwargs)
    return max(a_b / (1. * b_b), b_a / (1. * a_a))


[docs]def commutative_similarity_score_with_tolerance(node, target, tolerance, *args, **kwargs):
    """Apply :func:`monosaccharide_similarity` to ``node`` and ``target`` for both
    ``node --> target`` and ``target --> node``, returning the maximally normalized
    ratio score, and whether there was a pair error less than `tolerance`.

    This can be viewed as a combination of :func:`commutative_similarity` and
    :func:`commutative_similarity_score` while making fewer calls to
    :func:`monosaccharide_similarity`.

    Parameters
    ----------
    node: :class:`~.Monosaccharide`
        The reference monosaccharide
    target: :class:`~.Monosaccharide`
        The monosaccharide to compare against
    tolerance: :class:`int`
        The minimum number of errors to tolerate
    *args:
        Forwarded to :func:`monosaccharide_similarity`
    **kwargs:
        Forwarded to :func:`monosaccharide_similarity`

    Returns
    -------
    :class:`float`:
        The maximal similarity score ratio
    :class:`bool`:
        Whether the difference passes error tolerance
    """
    a_b, b_b = monosaccharide_similarity(node, target, *args, **kwargs)
    b_a, a_a = monosaccharide_similarity(target, node, *args, **kwargs)
    pass_threshold = False
    if (b_b - a_b) <= tolerance or (a_a - b_a) <= tolerance:
        pass_threshold = True
    return max(a_b / (1. * b_b), b_a / (1. * a_a)), pass_threshold



[docs]def has_substituent(monosaccharide, substituent):
    """Checks whether ``monosaccharide`` has any substituent groups
    matching ``substituent``.

    Parameters
    ----------
    monosaccharide : :class:`~.Monosaccharide`
        The monosaccharide to check
    substituent : :class:`~.Substituent` or :class:`str`
        The substituent to check for

    Returns
    -------
    :class:`bool`
    """
    # Use the setter property to force the translation
    # of the name string.
    if isinstance(substituent, basestring):
        substituent = Substituent(substituent)
    substituent = substituent._name
    for position, subst in monosaccharide.substituents():
        if substituent == subst._name:
            return True
    return False


[docs]def has_modification(monosaccharide, modification):
    """Checks whether ``monosaccharide`` has any modification sites
    matching ``modification``.

    Parameters
    ----------
    monosaccharide : :class:`~.Monosaccharide`
        The monosaccharide to check
    modification : :class:`~.constants.Modification` or :class:`str`
        The modification to check for

    Returns
    -------
    :class:`bool`
    """
    for position, mod in monosaccharide.modifications.items():
        if mod == modification:
            return True
    return False


[docs]def has_monosaccharide(glycan, monosaccharide, tolerance=0, *args, **kwargs):
    """Checks whether ``glycan`` has any monosaccharide nodes
    matching ``monosaccharide`` within ``tolerance`` using
    :func:`commutative_similarity`

    Parameters
    ----------
    glycan: :class:`~.SaccharideCollection`
        The glycan structure or composition to search
    monosaccharide: :class:`~.Monosaccharide`
        The monosaccharide to search for
    tolerance : int, optional
        The error tolerance to use
    *args:
        Forwarded to :func:`monosaccharide_similarity`
    **kwargs:
        Forwarded to :func:`monosaccharide_similarity`

    Returns
    -------
    :class:`bool`
    """
    if isinstance(monosaccharide, basestring):
        monosaccharide = monosaccharides[monosaccharide]
    visited = set()
    for node in glycan:
        if commutative_similarity(
                node, monosaccharide, tolerance=tolerance, visited=visited, *args, **kwargs):
            return node
    return False


[docs]def is_reduced(obj):
    """A simple predicate to test whether an object has a reduced structure.

     If `obj` does not have a `reducing_end` attribute, this will return :const:`False`

    Parameters
    ----------
    obj : object
        The object to check

    Returns
    -------
    bool
    """
    try:
        return obj.reducing_end is not None
    except AttributeError:
        return False


[docs]def is_amine(substituent):
    """A simple predicate to test whether a substituent has an amine group adjacent
    attached to the carbon backbone by naming convention.

    This predicate checks to see if the name of the substituent is "amino" or if it
    starts with the phrase "n_" only.

    Parameters
    ----------
    substituent : Substituent  or str
        The object to test

    Returns
    -------
    bool
    """
    if isinstance(substituent, Substituent):
        name = substituent.name
    else:
        name = substituent
    return name.startswith("n_") or name == "amino"


[docs]def is_aminated(monosaccharide):
    """Tests to see if any substituents of `monosaccharide` are amines.

    Each substituent is tested using :func:`is_amine`, with all the caveats that
    entails.

    Parameters
    ----------
    monosaccharide : Monosaccharide
        The monosaccharide to test

    Returns
    -------
    bool

    See Also
    --------
    is_amine
    """
    for p, substituent in monosaccharide.substituents():
        if is_amine(substituent):
            return True
    return False


has_fucose = functools.partial(has_monosaccharide, monosaccharide=monosaccharides["Fucose"])
has_n_acetyl = functools.partial(has_substituent, substituent=Substituent("n-acetyl"))
is_acidic = functools.partial(has_modification, modification=Modification.Acidic)
is_sulfated = functools.partial(has_substituent, substituent=Substituent("sulfate"))


[docs]def is_generic_monosaccharide(monosaccharide):
    """Tests if the :attr:`~.Monosaccharide.stem` is unknown.

    Parameters
    ----------
    monosaccharide : Monosaccharide
        The object to test

    Returns
    -------
    bool
    """
    return monosaccharide.stem[0] is Stem.x


[docs]def is_derivatized(monosaccharide):
    """Tests whether any of the substituents attached to `monosaccharide` were
    added by derivatization.

    Parameters
    ----------
    monosaccharide : Monosaccharide
        The object to test

    Returns
    -------
    bool
    """
    for pos, sub in monosaccharide.substituents():
        if sub._derivatize:
            return True
    return False