May-14-2024, 04:52 PM
I wrote the code below to parse a Google-like query string with the help of the great pyparsing library. The code examples also show which expressions work and which (faulty) expressions fail. In the end, I want the parser to be completely fault-tolerant and simply ignore the errors (while still producing a parse tree). I wonder whether I should just preprocess the query string or adapt the grammar for that (which seems quite hard to me). Are there any pyparsing experts here who can give me some advice?
"""Parse a Google-like search query string into a small expression tree.

The grammar is built with pyparsing.  Operator precedence, from loosest to
tightest binding:

1. ``OR``                      (explicit keyword)
2. ``AND``                     (explicit keyword, or implicit by juxtaposition)
3. ``-``                       (prefix NOT)
4. ``( ... )`` / word / phrase

Binary operators are expressed right-recursively, so chains such as
``a AND b AND c`` produce right-nested ``BinaryNode`` trees.
"""
from typing import Literal, Union

import pyparsing as pp

# Any node that can appear in the resulting parse tree.
Node = Union["TermNode", "UnaryNode", "BinaryNode"]


class TermNode:
    """Leaf node: a bare word or a double-quoted phrase."""

    def __init__(self, term_type: Literal["WORD", "PHRASE"], value: str) -> None:
        self.term_type = term_type
        self.value = value

    def __repr__(self) -> str:
        return f"TermNode({self.term_type}, {self.value})"


class UnaryNode:
    """Prefix operator applied to a single operand node."""

    def __init__(self, operator: Literal["NOT"], operand: Node) -> None:
        self.operator = operator
        self.operand = operand

    def __repr__(self) -> str:
        return f"UnaryNode({self.operator}, {self.operand})"


class BinaryNode:
    """Binary operator joining a left and a right operand node."""

    def __init__(
        self, operator: Literal["AND", "OR"], left: Node, right: Node
    ) -> None:
        self.operator = operator
        self.left = left
        self.right = right

    def __repr__(self) -> str:
        return f"BinaryNode({self.operator}, {self.left}, {self.right})"


# --- Tokens -----------------------------------------------------------------
not_ = pp.Literal("-")
and_ = pp.Keyword("AND")
or_ = pp.Keyword("OR")
lparen = pp.Literal("(")
rparen = pp.Literal(")")

# The negative lookahead keeps the AND/OR keywords from being consumed as
# ordinary words; alphas8bit admits 8-bit letters such as umlauts.
word = ~(and_ | or_) + pp.Word(pp.alphanums + pp.alphas8bit).set_parse_action(
    lambda t: TermNode("WORD", t[0])
)
# With pyparsing's defaults the surrounding quotes are stripped, so t[0] is
# the phrase content only.
phrase = pp.QuotedString(quoteChar='"').set_parse_action(
    lambda t: TermNode("PHRASE", t[0])
)
term = (phrase | word)

# --- Grammar (tightest binding first) ---------------------------------------
or_expression = pp.Forward()

parens_expression = pp.Forward()
parens_expression <<= (
    pp.Suppress(lparen) + or_expression + pp.Suppress(rparen)
) | term

# t[0] is the "-" token itself; t[1] is the already-built operand node.
not_expression = pp.Forward()
not_expression <<= (not_ + not_expression).set_parse_action(
    lambda t: UnaryNode("NOT", t[1])
) | parens_expression

# Alternatives are tried in order: explicit "AND", then implicit AND
# (two adjacent operands), then a single operand.
and_expression = pp.Forward()
and_expression <<= (
    (not_expression + and_ + and_expression).set_parse_action(
        lambda t: BinaryNode("AND", t[0], t[2])
    )
    | (not_expression + and_expression).set_parse_action(
        lambda t: BinaryNode("AND", t[0], t[1])
    )
    | not_expression
)

or_expression <<= (and_expression + or_ + or_expression).set_parse_action(
    lambda t: BinaryNode("OR", t[0], t[2])
) | and_expression

# Lines starting with "#" are comments for run_tests.  Everything after the
# "Invalid expression" marker is malformed input that the grammar currently
# rejects.
or_expression.run_tests("""\
    # Word term
    foobar
    # Umlaute in word term
    Gürtel
    # Phrase term
    "foo bar"
    # Special characters in phrase
    "foo!~ bar %"
    # Implicit AND
    foo bar
    # Explicit AND
    foo AND bar
    # Explicit OR
    foo OR bar
    # NOT
    -foo
    # Parenthesis
    foo AND (bar OR baz)
    # Complex expression 1
    -foo AND ("bar baz" OR qux)
    # Complex expression 2
    foo AND (-"bar baz" (moo OR zoo) AND yoo)
    # Complex expression 3
    foo (bar -"baz moo") zoo
    ###
    # Invalid expression
    ###
    # Unary before binary operator
    foo - AND bar
    # Unknown char outside quotes
    foo ~ bar
    # Binary operator at start of input
    AND foo
    # Binary operator at start of parens expression
    (AND bar)
    # Binary operator at end of input
    foo AND
    # Binary operator at end of parens expression
    (foo AND)
    # Unary operator at end of input
    foo -
    # Unary operator at end of parens expression
    (foo -)
    # Unbalanced parens
    ((foo)
    # Unbalanced quotes
    ""foo"
    """)