Python Forum

Full Version: [pyparsing] How to make my simple parser fault tolerant
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
I wrote the code below to parse a Google-like query string with the help of the great pyparsing library. The code examples also show which expressions work and which (faulty) expressions fail. In the end, I want the parser to be completely fault tolerant and simply ignore the errors (while still producing a parse tree). I wonder whether I should just preprocess the query string or adapt the grammar instead (which seems quite hard to me). Are there any pyparsing experts here who can give me some advice?

import pyparsing as pp
from typing import Literal

class TermNode:
  """Leaf of the query tree: a bare word ("WORD") or a quoted phrase ("PHRASE").

  Both constructor arguments are stored unchanged as attributes of the same
  name; no validation is performed.
  """

  def __init__(self, term_type: Literal["WORD", "PHRASE"], value: str):
    self.term_type, self.value = term_type, value

  def __repr__(self):
    # Compact, unquoted rendering, e.g. TermNode(WORD, foobar).
    kind, text = self.term_type, self.value
    return "TermNode({}, {})".format(kind, text)

class UnaryNode:
  """Query-tree node that applies a unary operator to a single operand.

  Args:
    operator: Operator tag; "NOT" is the only value the annotation allows.
    operand: The sub-expression the operator applies to. This is an
      already-built tree node (term, nested unary, or binary node), not a
      plain string, which is why it is annotated with the node types.
  """

  def __init__(
    self,
    operator: Literal["NOT"],
    operand: "TermNode | UnaryNode | BinaryNode",
  ) -> None:
    self.operator = operator
    self.operand = operand

  def __repr__(self) -> str:
    # The operand is interpolated with str(), so nested nodes render through
    # their own __repr__ and the whole subtree shows up on one line.
    return f"UnaryNode({self.operator}, {self.operand})"

class BinaryNode:
  """Query-tree node that joins two sub-expressions with "AND" or "OR".

  Args:
    operator: Operator tag, either "AND" or "OR".
    left: Left-hand sub-expression (an already-built tree node).
    right: Right-hand sub-expression (an already-built tree node).

  The operands are tree nodes rather than plain strings, which is why they
  are annotated with the node types.
  """

  def __init__(
    self,
    operator: Literal["AND", "OR"],
    left: "TermNode | UnaryNode | BinaryNode",
    right: "TermNode | UnaryNode | BinaryNode",
  ) -> None:
    self.operator = operator
    self.left = left
    self.right = right

  def __repr__(self) -> str:
    # Children are interpolated with str(), so nested nodes render through
    # their own __repr__ and the whole subtree shows up on one line.
    return f"BinaryNode({self.operator}, {self.left}, {self.right})"

# --- Tokens ------------------------------------------------------------------
not_ = pp.Literal("-")  # unary NOT is written as a leading minus sign
and_ = pp.Keyword("AND")
or_ = pp.Keyword("OR")
lparen = pp.Literal("(")
rparen = pp.Literal(")")

# --- Terms -------------------------------------------------------------------
# A bare word. The negative lookahead stops the AND/OR keywords from being
# consumed as ordinary words. The parse action is attached to the pp.Word
# element only, not to the lookahead.
word = ~(and_ | or_) + pp.Word(pp.alphanums + pp.alphas8bit).set_parse_action(
  lambda t: TermNode("WORD", t[0])
)
# A double-quoted phrase. `quote_char` is the PEP 8 spelling of the legacy
# `quoteChar` keyword, matching the snake_case pyparsing API used elsewhere
# in this file.
phrase = pp.QuotedString(quote_char='"').set_parse_action(
  lambda t: TermNode("PHRASE", t[0])
)
term = phrase | word

# --- Expressions -------------------------------------------------------------
# Binding strength, loosest to tightest:
#   OR  <  AND (explicit keyword or implicit juxtaposition)  <  NOT  <  parens/term
# or_expression is declared first because parens_expression refers to it
# before it can be defined.
or_expression = pp.Forward()

parens_expression = pp.Forward()
parens_expression <<= (pp.Suppress(lparen) + or_expression + pp.Suppress(rparen)) | term

not_expression = pp.Forward()
not_expression <<= (
  # t[0] is the "-" token, t[1] the (possibly itself negated) operand.
  (not_ + not_expression).set_parse_action(lambda t: UnaryNode("NOT", t[1]))
  | parens_expression
)

and_expression = pp.Forward()
and_expression <<= (
  # Explicit AND: t[1] is the matched AND keyword and is skipped.
  (not_expression + and_ + and_expression).set_parse_action(
    lambda t: BinaryNode("AND", t[0], t[2])
  )
  # Implicit AND: two adjacent expressions with no keyword in between.
  | (not_expression + and_expression).set_parse_action(
    lambda t: BinaryNode("AND", t[0], t[1])
  )
  | not_expression
)

or_expression <<= (
  # t[1] is the matched OR keyword and is skipped.
  (and_expression + or_ + or_expression).set_parse_action(
    lambda t: BinaryNode("OR", t[0], t[2])
  )
  | and_expression
)

# Manual check for the empty query, left disabled:
#or_expression.parse_string('', parse_all=True)

# Sample queries for run_tests(), one per line. Lines starting with "#" are
# treated by run_tests() as comments (its default comment marker). According
# to the accompanying description, the entries above the "Invalid expression"
# banner parse, while the ones below it currently fail.
_SAMPLE_QUERIES = """\
  # Word term
  foobar

  # Umlaute in word term
  Gürtel

  # Phrase term
  "foo bar"

  # Special characters in phrase
  "foo!~ bar %"

  # Implicit AND
  foo bar

  # Explicit AND
  foo AND bar

  # Explicit OR
  foo OR bar

  # NOT
  -foo

  # Parenthesis
  foo AND (bar OR baz)

  # Complex expression 1
  -foo AND ("bar baz" OR qux)

  # Complex expression 2
  foo AND (-"bar baz" (moo OR zoo) AND yoo)

  # Complex expression 3
  foo (bar -"baz moo") zoo

  ###
  # Invalid expression
  ###

  # Unary before binary operator
  foo - AND bar

  # Unknown char outside quotes
  foo ~ bar

  # Binary operator at start of input
  AND foo

  # Binary operator at start of parens expression
  (AND bar)

  # Binary operator at end of input
  foo AND

  # Binary operator at end of parens expression
  (foo AND)

  # Unary operator at end of input
  foo -

  # Unary operator at end of parens expression
  (foo -)

  # Unbalanced parens
  ((foo)

  # Unbalanced quotes
  ""foo"
"""

or_expression.run_tests(_SAMPLE_QUERIES)