Source code for searchstringparser.lexer.general

# -*- coding: utf-8 -*-


from __future__ import (absolute_import, print_function)


"""
=================================
General Search String Token Lexer
=================================

:Authors:
    Moritz Emanuel Beber
:Date:
    2015-09-16
:Copyright:
    Copyright |c| 2015, Max Plank Institute for Molecular Genetics, all rights reserved.
:File:
    general.py

.. |c| unicode: U+A9
"""


__all__ = ["GeneralSearchStringLexer"]


import ply.lex as lex


[docs]class GeneralSearchStringLexer(object):
    """
    """

    states = (
        ("quoting", "exclusive"),
    )
    # List of token names.   This is always required
    tokens = (
       "WORD",
       "WILDCARD",
       "NOT",
       "AND",
       "OR",
       "LPAREN",
       "RPAREN",
       "SYMBOL",
       "LITERAL_QUOTE",
       "SPACE",
       "QUOTE"
    )

    # A string containing ignored characters (spaces and tabs)
    t_ignore = " \t\r\n\f\v"  # whitespace (interpreted literally not as regex)
    # Regular expression rules for simple tokens
    t_WORD = r"\w+"
    t_WILDCARD = r"\*"
    t_NOT = r"-|~|!|not|NOT"
    t_AND = r"&{1,2}|and|AND"
    t_OR = r"\|{1,2}|or|OR"
    # rules for 'quoting' state
    t_quoting_ignore = ""
    t_quoting_SYMBOL = r"([^'\"\s\\]|\\(?!'|\"))+"  # anything but other tokens

[docs]    def __init__(self, illegal="ignore", **kw_args):
        """A composite class of a ply.lex.lex.

        This is the setup step necessary before you can iterate over the tokens.

        Parameters
        ----------
        illegal : {'record', 'ignore', 'error'} (optional)
            Action to be taken when illegal characters are encountered. The
            default is to record them but continue without regarding them.
        kw_args :
            Keyword arguments are passed to the ply.lex.lex call.
        """
        super(GeneralSearchStringLexer, self).__init__()
# TODO: pick between different error handling methods
        self.lexer = lex.lex(module=self, **kw_args)
        self.parens_level = 0
        self.last_lparens = 0
        self.last_rparens = 0
        self.last_quote = None
        self._quote_start = None
        self._invalid = None
        self._invalid_pos = None

    def __getattr__(self, attr):
        return getattr(self.lexer, attr)

    def __iter__(self):
        return (tok for tok in iter(self.lexer.token, None))

[docs]    def input(self, data):
        """Add a new string to the lexer.

        This is the setup step necessary before you can iterate over the tokens.

        Parameters
        ----------
        data : str
            Any string.
        """
        self.lexer.push_state("INITIAL")
        self.parens_level = 0
        self.last_lparens = 0
        self.last_rparens = 0
        self.last_quote = None
        self._quote_start = None
        self._invalid = list()
        self._invalid_pos = list()
        self.lexer.input(data)

[docs]    def get_illegal(self):
        """Return encountered illegal characters.

        Returns
        -------
        None
            If no illegal characters occurred.
        Tuple
            A pair of lists that contain the illegal characters and
            the positions where they occurred.
        """
        if not self._invalid:
            return None
        return (self._invalid, self._invalid_pos)

    # inspect output
[docs]    def print_tokens(self, data):
        """Print all tokens in a string.

        First iterates through all tokens found and prints them to sys.stdout.
        Then prints illegal characters if any occurred.

        Parameters
        ----------
        data : str
            Any string.
        """
        self.input(data)
        for tok in self:
            print(tok)
        if self._invalid:
            print("Invalid character(s): " + ", ".join(self._invalid))
            print("at position(s): " + ", ".join([str(x) for x in self._invalid_pos]))

    # Error handling rule
    def t_ANY_error(self, t):
        self._invalid.append(t.value[0])
        self._invalid_pos.append(t.lexpos)
        t.lexer.skip(1)

    def t_LPAREN(self, t):
        r"\("
        self.parens_level += 1
        self.last_lparens = t.lexer.lexpos - 1
        return t

    def t_RPAREN(self, t):
        r"\)"
        self.parens_level -= 1
        self.last_rparens = t.lexer.lexpos - 1
        return t

    # quoting state
    def t_quoting_LITERAL_QUOTE(self, t):
        r"\\'|\\\""
        return t

    def t_QUOTE(self, t):
        r"'|\""
        self._quote_start = t.lexer.lexpos
        self.last_quote = t.lexer.lexpos - 1
        t.lexer.push_state("quoting")
        return t

    def t_quoting_QUOTE(self, t):
        r"'|\""
        # handle different quote inside quote
        if t.value != t.lexer.lexdata[self._quote_start - 1]:
            t.type = "LITERAL_QUOTE"
            return t
        self.last_quote = None
        t.lexer.pop_state()
        return t

    def t_quoting_SPACE(self, t):
        r"\s+"
        return t
Navigation

Source code for searchstringparser.lexer.general

Navigation