# Source code for glypy.io.cfg

'''
CFG Format
----------

An experimental parser for the `Consortium for Functional Glycomics <http://www.functionalglycomics.org/>`_
(CFG) glycan line format.

'''
import re
import warnings

from collections import deque, namedtuple
from functools import partial

from glypy.structure import (
    Monosaccharide, Glycan, Link, AmbiguousLink,
    Substituent, constants, named_structures,
    UnknownPosition, SuperClass, Modification)
from glypy.composition import Composition
from glypy.composition.structure_composition import substituent_compositions
from glypy.composition.composition_transform import has_derivatization, derivatize
from glypy.io import format_constants_map
from glypy.io.nomenclature import identity
from glypy.utils import invert_dict

from glypy.io.file_utils import ParserInterface, ParserError


# Plain-dict snapshot of the named monosaccharide table (name -> structure),
# kept so structures can be compared without going through the index's own
# lookup (described upstream as "copy-free comparison").
monosaccharide_reference = dict(named_structures.monosaccharides.items())

# Symbol -> anomer lookup, seeded from the shared format constants.
anomer_map_from = dict(format_constants_map.anomer_map)
# The entry stored under 'x' in the shared table is re-keyed as '?' here.
anomer_map_from['?'] = anomer_map_from.pop('x')
# Build the reverse table *before* the aliases below are registered, so each
# anomer maps back to its single-character symbol only.
anomer_map_to = invert_dict(anomer_map_from)
# Spelled-out and Greek-letter aliases for the 'a' / 'b' entries.
anomer_map_from.update((
    ('beta', anomer_map_from['b']),
    ('alpha', anomer_map_from['a']),
    (u"\u03B1", anomer_map_from['a']),
    (u"\u03B2", anomer_map_from['b']),
))

def _make_substituent_name(name):
    return ''.join(t.title() for t in name.split("_")).replace("(", "").replace(")", "")


# Substituent name -> symbol. Every known substituent first receives a
# generated symbol; the conventional abbreviations below then override it.
substituents_map_to = dict(
    (name, _make_substituent_name(name)) for name in substituent_compositions)

# Special Cases
substituents_map_to.update((
    ('n_acetyl', "NAc"),
    ('n_glycolyl', "NGc"),
    ('n_sulfate', "NS"),
    ('sulfate', "S"),
    ("methyl", "Me"),
    ("acetyl", "Ac"),
    ("glycolyl", "Gc"),
    ("fluoro", "F"),
    ("amino", "N"),
    ('phosphate', 'P'),
))

# Symbol -> substituent name, plus the spelled-out "Phosphate" as an extra
# accepted symbol for 'phosphate'.
substituents_map_from = invert_dict(substituents_map_to)
substituents_map_from['Phosphate'] = 'phosphate'


class CFGError(ParserError):
    '''Error raised by this module when CFG-format text cannot be interpreted.'''
# Result of parsing a linkage string: the position on the child residue, the
# position on the parent residue, and whether the text contained a "/"
# (i.e. listed alternative positions).
LinkageSpecification = namedtuple(
    "LinkageSpecification", ("child_position", "parent_position", "has_ambiguity"))


class LinkageDeserializer(object):
    '''Parse a linkage string such as ``"1-4"`` into a :class:`LinkageSpecification`.'''

    # The separator accepts "-", "->" and ",", matching what
    # MonosaccharideDeserializer.pattern allows in its ``linkage`` group.
    pattern = re.compile(r"(?P<child_linkage>[0-9?/]+)(?:->?|,)(?P<parent_linkage>[0-9?/]+)?")

    def parse(self, linkage_string):
        '''Parse `linkage_string`.

        Parameters
        ----------
        linkage_string : str or None
            The linkage text, e.g. ``"1-4"``, ``"2->3"``, ``"1-3/6"`` or ``"?-?"``.

        Returns
        -------
        LinkageSpecification or None
            |None| when `linkage_string` is |None|.

        Raises
        ------
        CFGError
            If the text does not match the linkage pattern.
        '''
        if linkage_string is None:
            return None
        match = self.pattern.search(linkage_string)
        if match is None:
            raise CFGError(linkage_string)
        has_ambiguity = "/" in linkage_string
        groups = match.groupdict()
        child_linkage = groups['child_linkage']
        if child_linkage is not None:
            child_linkage = self.parse_position(child_linkage)
        # The parent side is optional in the pattern and may be absent.
        parent_linkage = groups['parent_linkage']
        if parent_linkage is not None:
            parent_linkage = self.parse_position(parent_linkage)
        return LinkageSpecification(child_linkage, parent_linkage, has_ambiguity)

    def parse_position(self, position_string):
        '''Convert one side of a linkage to a position.

        ``"?"`` becomes ``UnknownPosition``, a "/"-separated string becomes a
        list of ints, and anything else is converted with ``int``.
        '''
        if position_string == '?':
            return UnknownPosition
        if '/' in position_string:
            return [int(token) for token in position_string.split('/')]
        return int(position_string)

    def __call__(self, linkage_string):
        '''Alias for :meth:`parse`.'''
        return self.parse(linkage_string)


class SubstituentDeserializer(object):
    '''Translate ``(position, symbol)`` pairs into ``(position, substituent name)`` pairs.

    Parameters
    ----------
    error_on_missing : bool
        When true (the default) an unknown symbol raises :class:`CFGError`;
        otherwise a warning is issued and the symbol is skipped.
    '''

    def __init__(self, error_on_missing=True):
        self.error_on_missing = error_on_missing

    def substituent_from_cfg(self, tokens):
        '''Lazily translate `tokens`.

        Parameters
        ----------
        tokens : iterable of (position, name)
            `position` may be |None| or anything accepted by ``int``; `name`
            is a key of ``substituents_map_from`` or the literal ``"A"``.

        Yields
        ------
        (int, str)
            The position (``UnknownPosition`` when missing or uninterpretable)
            and the translated substituent name. ``"A"`` is passed through
            untranslated.

        Raises
        ------
        CFGError
            For an unknown symbol when ``error_on_missing`` is true.
        '''
        for position, name in tokens:
            if position is None:
                position = UnknownPosition
            else:
                try:
                    position = int(position)
                except (ValueError, TypeError):
                    warnings.warn("Unable to interpret substituent position %r" % (position))
                    position = UnknownPosition
            try:
                name = substituents_map_from[name]
            except KeyError:
                # Acidic special case:
                # Often, acidic monosaccharides are written with a trailing A like a substituent while
                # GlycoCT treats acidic groups as modifications. If an A appears in the substituent suffix
                # it will fail to be cast as a substituent, but pass it along raw and it will be handled
                # downstream by :meth:`MonosaccharideDeserializer.set_substituents`.
                if name != "A":  # pragma: no cover
                    message = "No translation rule found to convert %s into a Substituent" % name
                    if self.error_on_missing:
                        raise CFGError(message)
                    warnings.warn(message)
                    continue
            yield int(position), name

    def symbol_to_name(self, symbol):
        '''Look up `symbol` in ``substituents_map_from`` (``KeyError`` if absent).'''
        return substituents_map_from[symbol]

    def __call__(self, substituents):
        '''Alias for :meth:`substituent_from_cfg`.'''
        return self.substituent_from_cfg(substituents)


def aminate_substituent(substituent):
    '''Return the ``n_``-prefixed counterpart of `substituent`.

    A substituent whose name already starts with ``"n_"`` is returned as is.

    Raises
    ------
    ValueError
        If the ``n_``-prefixed substituent has an empty composition.
    '''
    if substituent.name.startswith("n_"):
        # already aminated
        return substituent
    aminated = Substituent("n_" + substituent.name)
    if aminated.composition == {}:
        raise ValueError("Could not aminate substituent")
    return aminated


class MonosaccharideDeserializer(object):
    '''Parse the right-most residue of a CFG string into a monosaccharide.

    Parameters
    ----------
    substituent_deserializer : SubstituentDeserializer, optional
        Used to translate substituent symbols; a default instance is created
        when omitted.
    '''

    # Pieces, in order: optional "(<pos><name>)" substituent prefix, base type,
    # optional "<pos><name>" substituent suffix, anomer, optional linkage.
    # The pattern is anchored at the end of the string only.
    _pattern = r"""
    (?:\((?P<substituent_prefix_position>[0-9\?]+?)
       (?P<substituent_prefix_name>[A-Za-z]+?)\))?
    (?P<base_type>(?:[A-Z][a-z]{2}?|(?:[a-z]{3}[A-Z][a-z]{2})))
    (?:(?P<substituent_position>\d+?)?(?P<substituent_name>[A-Za-z]+?))?
    (?P<anomer>a|b|\?|alpha|beta|\u03B1|\u03B2)
    (?P<linkage>[0-9?/]+(?:->?|,)[0-9?/]+?)?
    $"""
    try:
        # convert to unicode for Py2
        _pattern = _pattern.decode("raw_unicode_escape")
    except AttributeError:
        pass
    pattern = re.compile(_pattern, re.VERBOSE | re.UNICODE)

    def __init__(self, substituent_deserializer=None):
        if substituent_deserializer is None:
            substituent_deserializer = SubstituentDeserializer()
        self.substituent_deserializer = substituent_deserializer
        self.linkage_parser = LinkageDeserializer()

    def has_pattern(self, string):
        '''Return the match of the residue pattern in `string`, or |None|.'''
        return self.pattern.search(string)

    def extract_pattern(self, monosaccharide_str):
        '''Return the named groups of the residue pattern as a dict.

        Raises
        ------
        CFGError
            If `monosaccharide_str` does not contain the pattern.
        '''
        match = self.pattern.search(monosaccharide_str)
        if match is None:
            raise CFGError("Cannot find monosaccharide pattern in {}".format(monosaccharide_str))
        return match.groupdict()

    def build_residue(self, match_dict):
        '''Create a residue from the groups produced by :meth:`extract_pattern`.

        Returns
        -------
        (residue, linkage)
            The residue looked up by base type with its anomer and
            substituents set, and the raw (unparsed) linkage text or |None|.

        Raises
        ------
        CFGError
            If the base type is not a known monosaccharide name.
        '''
        try:
            anomer = anomer_map_from[match_dict['anomer']]
        except KeyError:
            anomer = anomer_map_from['?']
        base_type = match_dict["base_type"]
        linkage = match_dict.get("linkage")

        try:
            residue = named_structures.monosaccharides[base_type]
        except KeyError:
            raise CFGError("Unknown Residue Base-type %r" % (base_type,))
        # Recorded before substituents from the text are attached, so it
        # reflects only what the named base structure already carries.
        base_is_modified = len(residue.substituent_links) + len(residue.modifications) > 0
        residue.anomer = anomer

        # Suffix substituent first, then prefix substituent; set_substituents
        # relies on this order when guessing positions.
        substituents = []
        if match_dict['substituent_name'] is not None:
            substituents.append(
                (match_dict['substituent_position'], match_dict['substituent_name']))
        if match_dict['substituent_prefix_name'] is not None:
            substituents.append(
                (match_dict['substituent_prefix_position'], match_dict['substituent_prefix_name']))
        self.set_substituents(residue, substituents, base_is_modified, base_type)
        return residue, linkage

    def set_substituents(self, residue, substituent_set, base_is_modified, base_type):
        '''Attach each substituent in `substituent_set` to `residue`.

        Raises
        ------
        ValueError
            If a substituent has no position on an already-modified base and
            no default can be guessed, or if attaching fails on a base other
            than ``Neu``/``Kdo``.
        CFGError
            If a substituent cannot be placed on a ``Neu``/``Kdo`` base even
            after replacing the existing occupant.
        '''
        translated = self.substituent_deserializer(substituent_set)
        for i, (position, substituent) in enumerate(translated, 1):
            if position == UnknownPosition and base_is_modified:
                # Guess at what the user might mean using base_type
                if base_type == "Neu" and substituent in ["acetyl", "glycolyl"] and i == 1:
                    position = 5
                else:
                    raise ValueError(
                        "Cannot have ambiguous location of substituents on a base type which"
                        " has default modifications or substituents. {} {}".format(
                            residue, (position, substituent)))

            # Often, acidic monosaccharides will be suffixed "A" instead of prefixed "a".
            # Handle this here.
            if substituent == "A":
                residue.add_modification(Modification.a, position)
                continue

            substituent = Substituent(substituent)
            try:
                residue.add_substituent(
                    substituent, position,
                    parent_loss=substituent.attachment_composition_loss(),
                    child_loss='H')
            except ValueError:
                # Highly modified large bases have a degenerate encoding, where additional qualifications following
                # base name *replace* an existing substituent. This behavior may not be expected in other more
                # common cases.
                if base_type not in {"Neu", "Kdo"}:
                    raise
                occupancy = 0
                try:
                    unplaced = residue.substituent_links[position][0].child
                    residue.drop_substituent(position)
                    if unplaced.name == "amino":
                        try:
                            substituent = aminate_substituent(substituent)
                        except ValueError:
                            pass
                except ValueError:
                    # The site contains a modification which can be present alongside the substituent
                    occupancy = 1
                except IndexError:
                    occupancy = 1
                try:
                    residue.add_substituent(
                        substituent, position, occupancy,
                        parent_loss=substituent.attachment_composition_loss(),
                        child_loss='H')
                except ValueError:
                    raise CFGError("Can't resolve %s" % substituent)

    def add_monosaccharide_bond(self, residue, parent, linkage):
        '''Bond `residue` to `parent` using `linkage`.

        Nothing happens when `parent` is |None| or `linkage` is an empty tuple.

        Raises
        ------
        CFGError
            If there is a parent but `linkage` is |None|, i.e. the text gave
            no linkage for a residue that is not the root.
        '''
        if parent is None or linkage == ():
            return
        if linkage is None:
            # Previously this fell through to ``None[1]`` and surfaced as an
            # uninformative TypeError.
            raise CFGError("Cannot attach {} to {} without a linkage".format(residue, parent))
        parent.add_monosaccharide(residue, position=linkage[1], child_position=linkage[0])

    def monosaccharide_from_cfg(self, monosaccharide_str, parent=None):
        '''Parse one residue and, when `parent` is given, bond it to `parent`.

        Returns
        -------
        (residue, linkage)
            The new residue and its parsed :class:`LinkageSpecification`
            (or |None| when the text carried no linkage).
        '''
        match_dict = self.extract_pattern(monosaccharide_str)
        residue, linkage = self.build_residue(match_dict)
        linkage = self.linkage_parser(linkage)
        self.add_monosaccharide_bond(residue, parent, linkage)
        return residue, linkage

    def __call__(self, cfg_str, parent=None):
        '''Alias for :meth:`monosaccharide_from_cfg`.'''
        return self.monosaccharide_from_cfg(cfg_str, parent=parent)

    def finalize(self, glycan):
        '''Hook called with the finished glycan; does nothing here.'''
        pass


class SpacerDeserializer(object):
    '''Recognize a trailing ``-<letters><digits>`` spacer suffix.'''

    pattern = re.compile(r"""
    -(?P<spacer_name>[A-Za-z]+)
    (?P<linkage>\d+)$
    """, re.VERBOSE | re.UNICODE)

    def has_pattern(self, string):
        '''Return the match of the spacer pattern in `string`, or |None|.'''
        return self.pattern.search(string)

    def remove_pattern(self, cfg_str):
        '''Return `cfg_str` with the spacer suffix removed.'''
        return self.pattern.sub("", cfg_str)

    def extract_pattern(self, cfg_str):
        '''Return the spacer's named groups as a dict.

        Raises
        ------
        CFGError
            If `cfg_str` does not end with a spacer.
        '''
        match = self.pattern.search(cfg_str)
        if match is None:
            raise CFGError("Cannot find spacer pattern in {}".format(cfg_str))
        return match.groupdict()

    def spacer_from_cfg(self, cfg_str):
        '''Placeholder: spacers are not modelled, so this returns |None|.'''
        return


class GlycanDeserializer(object):
    '''Parse a whole CFG glycan string into a glycan structure.

    Parameters
    ----------
    monosaccharide_deserializer : MonosaccharideDeserializer, optional
        Residue parser; a default instance is created when omitted.
    set_default_positions : bool
        Whether to run :meth:`set_default_positions_for_common_cases` on the
        parsed structure.
    '''

    def __init__(self, monosaccharide_deserializer=None, set_default_positions=True):
        if monosaccharide_deserializer is None:
            monosaccharide_deserializer = MonosaccharideDeserializer()
        self.monosaccharide_deserializer = monosaccharide_deserializer
        self.set_default_positions = set_default_positions
        self.spacer_deserializer = SpacerDeserializer()

    # The text is consumed from its right end, so a trailing ")" is where a
    # branch is *entered* and a trailing "(" is where it is left.
    new_branch_open = re.compile(r"(\))$")

    def add_monosaccharide(self, parent_node, child_node, linkage):
        '''Bond `child_node` to `parent_node` via the residue deserializer.'''
        self.monosaccharide_deserializer.add_monosaccharide_bond(
            child_node, parent_node, linkage)

    def glycan_from_cfg(self, text, structure_class=Glycan, **kwargs):
        '''Parse `text` into an instance of `structure_class`.

        The string is consumed from right to left: the right-most residue
        becomes the root and each residue to its left is bonded to the
        residue parsed before it. Parenthesised segments are parsed as
        branches and attached to the residue that follows the ")".

        Parameters
        ----------
        text : str
            The CFG sequence. A trailing spacer, if present, is discarded.
        structure_class : type
            Called as ``structure_class(root=root)`` to build the result.
        **kwargs
            Accepted for interface compatibility; not used.

        Raises
        ------
        CFGError
            On a "(" without a preceding ")", a ")" that is never closed by
            a "(", or text where no residue can be identified.
        '''
        last_outedge = None
        root = None
        last_residue = None
        branch_stack = []

        # Remove the base
        if self.spacer_deserializer.has_pattern(text):
            text = self.spacer_deserializer.remove_pattern(text)

        while text:
            # If starting a new branch
            match = self.new_branch_open.search(text)
            if match is not None:
                step = match.end(1) - match.start(1)
                text = text[:-step]
                branch_stack.append((last_residue, root, last_outedge))
                root = None
                last_residue = None
                last_outedge = None
            # If ending a branch
            elif text[-1] == '(':
                try:
                    branch_parent, old_root, old_last_outedge = branch_stack.pop()
                    # child_position, parent_position = last_outedge
                    self.add_monosaccharide(branch_parent, root, last_outedge)
                    root = old_root
                    last_residue = branch_parent
                    last_outedge = old_last_outedge
                    text = text[:-1]
                except IndexError:
                    raise CFGError("Bad branching at {}".format(len(text)))
            # Parsing a residue
            else:
                match = self.monosaccharide_deserializer.has_pattern(text)
                if not match:
                    raise CFGError(
                        "Could not identify residue '...{}' at {}".format(text[-30:], len(text)))
                next_residue, outedge = self.monosaccharide_deserializer(
                    text[match.start(): match.end()], last_residue)
                # The first residue of a chain or branch is its root; keep
                # its linkage so the branch can be attached when it closes.
                if root is None:
                    last_outedge = outedge
                    root = next_residue
                last_residue = next_residue
                text = text[:match.start()]

        if branch_stack:
            # A ")" was seen without its "(": without this check everything
            # parsed before the unmatched ")" would be silently discarded.
            raise CFGError("Bad branching: {} unclosed branch(es)".format(len(branch_stack)))

        res = structure_class(root=root)
        self.monosaccharide_deserializer.finalize(res)
        res.reindex()
        if self.set_default_positions:
            self.set_default_positions_for_common_cases(res)
        return res

    def __call__(self, text, **kwargs):
        '''Alias for :meth:`glycan_from_cfg`.'''
        return self.glycan_from_cfg(text, **kwargs)

    def set_default_positions_for_common_cases(self, glycan):
        '''Give unpositioned ``n_acetyl`` substituents a conventional position.

        For every node, each substituent link stored at ``UnknownPosition``
        whose substituent is ``n_acetyl`` is moved to position 2 when the
        node's superclass is ``hex`` or position 5 when it is ``non``,
        provided that position is not already occupied. Other links are left
        untouched.

        Returns
        -------
        The same `glycan`, modified in place.
        '''
        for node in glycan:
            # Snapshot the items first: links are re-applied below, which
            # changes ``node.substituent_links``.
            candidates = [
                link for pos, link in list(node.substituent_links.items())
                if pos == UnknownPosition
            ]
            for candidate in candidates:
                substituent = candidate.to(node)
                position = None
                if substituent.name == 'n_acetyl':
                    if node.superclass == SuperClass.hex:
                        position = 2
                    elif node.superclass == SuperClass.non:
                        position = 5
                if position is None:
                    continue
                if not node.is_occupied(position):
                    candidate.break_link(refund=True)
                    candidate.parent_position = position
                    candidate.apply()
        return glycan


# Module-level parser instance with default settings.
glycan_parser = GlycanDeserializer()
def loads(text):
    '''Parse a single CFG glycan sequence.

    .. note::
        The spacer, if any, is ignored.

    Parameters
    ----------
    text : str
        The sequence to parse

    Returns
    -------
    structure : :class:`~.Glycan`
        The parsed glycan structure

    Raises
    ------
    CFGError
        If the sequence cannot be parsed
    '''
    return glycan_parser(text)