관리-도구
편집 파일: lex.py
#----------------------------------------------------------------------------- # ply: lex.py # # Author: David M. Beazley (dave@dabeaz.com) # # Copyright (C) 2001-2006, David M. Beazley # # This library is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This library is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # See the file COPYING for a complete copy of the LGPL. #----------------------------------------------------------------------------- __version__ = "2.2" import re, sys, types from . import util import collections # Regular expression used to match valid token names _is_identifier = re.compile(r'^[a-zA-Z0-9_]+$') # Available instance types. This is used when parsers are defined by a class. # In Python3 the InstanceType and ObjectType are no more, they've passed, ceased # to be, they are ex-classes along with old-style classes try: _INSTANCETYPE = (types.InstanceType, types.ObjectType) except AttributeError: _INSTANCETYPE = object # Exception thrown when invalid token encountered and no default error # handler is defined. class LexError(Exception): def __init__(self,message,s): self.args = (message,) self.text = s # Token class class LexToken(object): def __str__(self): return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos) def __repr__(self): return str(self) def skip(self,n): self.lexer.skip(n) # ----------------------------------------------------------------------------- # Lexer class # # This class encapsulates all of the methods and data associated with a lexer. # # input() - Store a new string in the lexer # token() - Get the next token # ----------------------------------------------------------------------------- class Lexer: def __init__(self): self.lexre = None # Master regular expression. This is a list of # tuples (re,findex) where re is a compiled # regular expression and findex is a list # mapping regex group numbers to rules self.lexretext = None # Current regular expression strings self.lexstatere = {} # Dictionary mapping lexer states to master regexs self.lexstateretext = {} # Dictionary mapping lexer states to regex strings self.lexstate = "INITIAL" # Current lexer state self.lexstatestack = [] # Stack of lexer states self.lexstateinfo = None # State information self.lexstateignore = {} # Dictionary of ignored characters for each state self.lexstateerrorf = {} # Dictionary of error functions for each state self.lexreflags = 0 # Optional re compile flags self.lexdata = None # Actual input data (as a string) self.lexpos = 0 # Current position in input text self.lexlen = 0 # Length of the input text self.lexerrorf = None # Error rule (if any) self.lextokens = None # List of valid tokens self.lexignore = "" # Ignored characters self.lexliterals = "" # Literal characters that can be passed through self.lexmodule = None # Module self.lineno = 1 # Current line number self.lexdebug = 0 # Debugging mode self.lexoptimize = 0 # Optimized mode def clone(self,object=None): c = Lexer() c.lexstatere = self.lexstatere c.lexstateinfo = self.lexstateinfo c.lexstateretext = self.lexstateretext c.lexstate = self.lexstate c.lexstatestack = self.lexstatestack c.lexstateignore = self.lexstateignore c.lexstateerrorf = self.lexstateerrorf c.lexreflags = self.lexreflags c.lexdata = self.lexdata c.lexpos = self.lexpos c.lexlen = self.lexlen c.lextokens = self.lextokens c.lexdebug = self.lexdebug c.lineno = self.lineno c.lexoptimize = self.lexoptimize c.lexliterals = self.lexliterals c.lexmodule = self.lexmodule # If the object parameter has been supplied, it means we are attaching the # lexer to a new object. In this case, we have to rebind all methods in # the lexstatere and lexstateerrorf tables. if object: newtab = { } for key, ritem in self.lexstatere.items(): newre = [] for cre, findex in ritem: newfindex = [] for f in findex: if not f or not f[0]: newfindex.append(f) continue newfindex.append((getattr(object,f[0].__name__),f[1])) newre.append((cre,newfindex)) newtab[key] = newre c.lexstatere = newtab c.lexstateerrorf = { } for key, ef in self.lexstateerrorf.items(): c.lexstateerrorf[key] = getattr(object,ef.__name__) c.lexmodule = object # Set up other attributes c.begin(c.lexstate) return c # ------------------------------------------------------------ # writetab() - Write lexer information to a table file # ------------------------------------------------------------ def writetab(self,tabfile): tf = open(tabfile+".py","w") tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__)) tf.write("_lextokens = %s\n" % repr(self.lextokens)) tf.write("_lexreflags = %s\n" % repr(self.lexreflags)) tf.write("_lexliterals = %s\n" % repr(self.lexliterals)) tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo)) tabre = { } for key, lre in self.lexstatere.items(): titem = [] for i in range(len(lre)): titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1]))) tabre[key] = titem tf.write("_lexstatere = %s\n" % repr(tabre)) tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore)) taberr = { } for key, ef in self.lexstateerrorf.items(): if ef: taberr[key] = ef.__name__ else: taberr[key] = None tf.write("_lexstateerrorf = %s\n" % repr(taberr)) tf.close() # ------------------------------------------------------------ # readtab() - Read lexer information from a tab file # ------------------------------------------------------------ def readtab(self,tabfile,fdict): exec("import %s as lextab" % tabfile) self.lextokens = lextab._lextokens self.lexreflags = lextab._lexreflags self.lexliterals = lextab._lexliterals self.lexstateinfo = lextab._lexstateinfo self.lexstateignore = lextab._lexstateignore self.lexstatere = { } self.lexstateretext = { } for key,lre in lextab._lexstatere.items(): titem = [] txtitem = [] for i in range(len(lre)): titem.append((re.compile(lre[i][0],lextab._lexreflags),_names_to_funcs(lre[i][1],fdict))) txtitem.append(lre[i][0]) self.lexstatere[key] = titem self.lexstateretext[key] = txtitem self.lexstateerrorf = { } for key,ef in lextab._lexstateerrorf.items(): self.lexstateerrorf[key] = fdict[ef] self.begin('INITIAL') # ------------------------------------------------------------ # input() - Push a new string into the lexer # ------------------------------------------------------------ def input(self,s): if not (isinstance(s,util.bytes_type) or isinstance(s, util.string_type)): raise ValueError("Expected a string") self.lexdata = s self.lexpos = 0 self.lexlen = len(s) # ------------------------------------------------------------ # begin() - Changes the lexing state # ------------------------------------------------------------ def begin(self,state): if state not in self.lexstatere: raise ValueError("Undefined state") self.lexre = self.lexstatere[state] self.lexretext = self.lexstateretext[state] self.lexignore = self.lexstateignore.get(state,"") self.lexerrorf = self.lexstateerrorf.get(state,None) self.lexstate = state # ------------------------------------------------------------ # push_state() - Changes the lexing state and saves old on stack # ------------------------------------------------------------ def push_state(self,state): self.lexstatestack.append(self.lexstate) self.begin(state) # ------------------------------------------------------------ # pop_state() - Restores the previous state # ------------------------------------------------------------ def pop_state(self): self.begin(self.lexstatestack.pop()) # ------------------------------------------------------------ # current_state() - Returns the current lexing state # ------------------------------------------------------------ def current_state(self): return self.lexstate # ------------------------------------------------------------ # skip() - Skip ahead n characters # ------------------------------------------------------------ def skip(self,n): self.lexpos += n # ------------------------------------------------------------ # token() - Return the next token from the Lexer # # Note: This function has been carefully implemented to be as fast # as possible. Don't make changes unless you really know what # you are doing # ------------------------------------------------------------ def token(self): # Make local copies of frequently referenced attributes lexpos = self.lexpos lexlen = self.lexlen lexignore = self.lexignore lexdata = self.lexdata while lexpos < lexlen: # This code provides some short-circuit code for whitespace, tabs, and other ignored characters if lexdata[lexpos] in lexignore: lexpos += 1 continue # Look for a regular expression match for lexre,lexindexfunc in self.lexre: m = lexre.match(lexdata,lexpos) if not m: continue # Set last match in lexer so that rules can access it if they want self.lexmatch = m # Create a token for return tok = LexToken() tok.value = m.group() tok.lineno = self.lineno tok.lexpos = lexpos tok.lexer = self lexpos = m.end() i = m.lastindex func,tok.type = lexindexfunc[i] self.lexpos = lexpos if not func: # If no token type was set, it's an ignored token if tok.type: return tok break # if func not callable, it means it's an ignored token if not isinstance(func, collections.Callable): break # If token is processed by a function, call it newtok = func(tok) # Every function must return a token, if nothing, we just move to next token if not newtok: lexpos = self.lexpos # This is here in case user has updated lexpos. break # Verify type of the token. If not in the token map, raise an error if not self.lexoptimize: if newtok.type not in self.lextokens: raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % ( func.__code__.co_filename, func.__code__.co_firstlineno, func.__name__, newtok.type),lexdata[lexpos:]) return newtok else: # No match, see if in literals if lexdata[lexpos] in self.lexliterals: tok = LexToken() tok.value = lexdata[lexpos] tok.lineno = self.lineno tok.lexer = self tok.type = tok.value tok.lexpos = lexpos self.lexpos = lexpos + 1 return tok # No match. Call t_error() if defined. if self.lexerrorf: tok = LexToken() tok.value = self.lexdata[lexpos:] tok.lineno = self.lineno tok.type = "error" tok.lexer = self tok.lexpos = lexpos self.lexpos = lexpos newtok = self.lexerrorf(tok) if lexpos == self.lexpos: # Error method didn't change text position at all. This is an error. raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:]) lexpos = self.lexpos if not newtok: continue return newtok self.lexpos = lexpos raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:]) self.lexpos = lexpos + 1 if self.lexdata is None: raise RuntimeError("No input string given with input()") return None # ----------------------------------------------------------------------------- # _validate_file() # # This checks to see if there are duplicated t_rulename() functions or strings # in the parser input file. This is done using a simple regular expression # match on each line in the filename. # ----------------------------------------------------------------------------- def _validate_file(filename): import os.path base,ext = os.path.splitext(filename) if ext != '.py': return 1 # No idea what the file is. Return OK try: f = open(filename) lines = f.readlines() f.close() except IOError: return 1 # Oh well fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(') sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=') counthash = { } linen = 1 noerror = 1 for l in lines: m = fre.match(l) if not m: m = sre.match(l) if m: name = m.group(1) prev = counthash.get(name) if not prev: counthash[name] = linen else: print("%s:%d: Rule %s redefined. Previously defined on line %d" % (filename,linen,name,prev)) noerror = 0 linen += 1 return noerror # ----------------------------------------------------------------------------- # _funcs_to_names() # # Given a list of regular expression functions, this converts it to a list # suitable for output to a table file # ----------------------------------------------------------------------------- def _funcs_to_names(funclist): result = [] for f in funclist: if f and f[0]: result.append((f[0].__name__,f[1])) else: result.append(f) return result # ----------------------------------------------------------------------------- # _names_to_funcs() # # Given a list of regular expression function names, this converts it back to # functions. # ----------------------------------------------------------------------------- def _names_to_funcs(namelist,fdict): result = [] for n in namelist: if n and n[0]: result.append((fdict[n[0]],n[1])) else: result.append(n) return result # ----------------------------------------------------------------------------- # _form_master_re() # # This function takes a list of all of the regex components and attempts to # form the master regular expression. Given limitations in the Python re # module, it may be necessary to break the master regex into separate expressions. # ----------------------------------------------------------------------------- def _form_master_re(relist,reflags,ldict): if not relist: return [] regex = "|".join(relist) try: lexre = re.compile(regex,re.VERBOSE | reflags) # Build the index to function map for the matching engine lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1) for f,i in lexre.groupindex.items(): handle = ldict.get(f,None) if type(handle) in (types.FunctionType, types.MethodType): lexindexfunc[i] = (handle,handle.__name__[2:]) elif handle is not None: # If rule was specified as a string, we build an anonymous # callback function to carry out the action if f.find("ignore_") > 0: lexindexfunc[i] = (None,None) print("IGNORE", f) else: lexindexfunc[i] = (None, f[2:]) return [(lexre,lexindexfunc)],[regex] except Exception as e: m = int(len(relist)/2) if m == 0: m = 1 llist, lre = _form_master_re(relist[:m],reflags,ldict) rlist, rre = _form_master_re(relist[m:],reflags,ldict) return llist+rlist, lre+rre # ----------------------------------------------------------------------------- # def _statetoken(s,names) # # Given a declaration name s of the form "t_" and a dictionary whose keys are # state names, this function returns a tuple (states,tokenname) where states # is a tuple of state names and tokenname is the name of the token. For example, # calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM') # ----------------------------------------------------------------------------- def _statetoken(s,names): nonstate = 1 parts = s.split("_") for i in range(1,len(parts)): if parts[i] not in names and parts[i] != 'ANY': break if i > 1: states = tuple(parts[1:i]) else: states = ('INITIAL',) if 'ANY' in states: states = tuple(names.keys()) tokenname = "_".join(parts[i:]) return (states,tokenname) # ----------------------------------------------------------------------------- # lex(module) # # Build all of the regular expression rules from definitions in the supplied module # ----------------------------------------------------------------------------- def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0): global lexer ldict = None stateinfo = { 'INITIAL' : 'inclusive'} error = 0 files = { } lexobj = Lexer() lexobj.lexdebug = debug lexobj.lexoptimize = optimize global token,input if nowarn: warn = 0 else: warn = 1 if object: module = object if module: # User supplied a module object. if isinstance(module, types.ModuleType): ldict = module.__dict__ elif isinstance(module, _INSTANCETYPE): _items = [(k,getattr(module,k)) for k in dir(module)] ldict = { } for (i,v) in _items: ldict[i] = v else: raise ValueError("Expected a module or instance") lexobj.lexmodule = module else: # No module given. We might be able to get information from the caller. try: raise RuntimeError except RuntimeError: e,b,t = sys.exc_info() f = t.tb_frame f = f.f_back # Walk out to our calling function ldict = f.f_globals # Grab its globals dictionary if optimize and lextab: try: lexobj.readtab(lextab,ldict) token = lexobj.token input = lexobj.input lexer = lexobj return lexobj except ImportError: pass # Get the tokens, states, and literals variables (if any) if (module and isinstance(module,_INSTANCETYPE)): tokens = getattr(module,"tokens",None) states = getattr(module,"states",None) literals = getattr(module,"literals","") else: tokens = ldict.get("tokens",None) states = ldict.get("states",None) literals = ldict.get("literals","") if not tokens: raise SyntaxError("lex: module does not define 'tokens'") if not (isinstance(tokens,list) or isinstance(tokens,tuple)): raise SyntaxError("lex: tokens must be a list or tuple.") # Build a dictionary of valid token names lexobj.lextokens = { } if not optimize: for n in tokens: if not _is_identifier.match(n): print("lex: Bad token name '%s'" % n) error = 1 if warn and n in lexobj.lextokens: print("lex: Warning. Token '%s' multiply defined." % n) lexobj.lextokens[n] = None else: for n in tokens: lexobj.lextokens[n] = None if debug: print("lex: tokens = '%s'" % list(lexobj.lextokens.keys())) try: for c in literals: if not (isinstance(c,util.bytes_type) or isinstance(c, util.string_type)) or len(c) > 1: print("lex: Invalid literal %s. Must be a single character" % repr(c)) error = 1 continue except TypeError: print("lex: Invalid literals specification. literals must be a sequence of characters.") error = 1 lexobj.lexliterals = literals # Build statemap if states: if not (isinstance(states,tuple) or isinstance(states,list)): print("lex: states must be defined as a tuple or list.") error = 1 else: for s in states: if not isinstance(s,tuple) or len(s) != 2: print("lex: invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')" % repr(s)) error = 1 continue name, statetype = s if isinstance(name, util.string_type): original_name = name name = util.encode_input(name) if not isinstance(name,util.bytes_type) or len(original_name) != len(name): print("lex: state name %s must be a byte string" % repr(original_name)) error = 1 continue if not (statetype == 'inclusive' or statetype == 'exclusive'): print("lex: state type for state %s must be 'inclusive' or 'exclusive'" % name) error = 1 continue if name in stateinfo: print("lex: state '%s' already defined." % name) error = 1 continue stateinfo[name] = statetype # Get a list of symbols with the t_ or s_ prefix tsymbols = [f for f in ldict.keys() if f[:2] == 't_' ] # Now build up a list of functions and a list of strings funcsym = { } # Symbols defined as functions strsym = { } # Symbols defined as strings toknames = { } # Mapping of symbols to token names for s in stateinfo.keys(): funcsym[s] = [] strsym[s] = [] ignore = { } # Ignore strings by state errorf = { } # Error functions by state if len(tsymbols) == 0: raise SyntaxError("lex: no rules of the form t_rulename are defined.") for f in tsymbols: t = ldict[f] states, tokname = _statetoken(f,stateinfo) toknames[f] = tokname if isinstance(t, collections.Callable): for s in states: funcsym[s].append((f,t)) elif (isinstance(t, util.bytes_type) or isinstance(t,util.string_type)): for s in states: strsym[s].append((f,t)) else: print("lex: %s not defined as a function or string" % f) error = 1 # Sort the functions by line number for f in funcsym.values(): f.sort(key=lambda x: x[1].__code__.co_firstlineno) # Sort the strings by regular expression length for s in strsym.values(): s.sort(key=lambda x: len(x[1])) regexs = { } # Build the master regular expressions for state in stateinfo.keys(): regex_list = [] # Add rules defined by functions first for fname, f in funcsym[state]: line = f.__code__.co_firstlineno file = f.__code__.co_filename files[file] = None tokname = toknames[fname] ismethod = isinstance(f, types.MethodType) if not optimize: nargs = f.__code__.co_argcount if ismethod: reqargs = 2 else: reqargs = 1 if nargs > reqargs: print("%s:%d: Rule '%s' has too many arguments." % (file,line,f.__name__)) error = 1 continue if nargs < reqargs: print("%s:%d: Rule '%s' requires an argument." % (file,line,f.__name__)) error = 1 continue if tokname == 'ignore': print("%s:%d: Rule '%s' must be defined as a string." % (file,line,f.__name__)) error = 1 continue if tokname == 'error': errorf[state] = f continue if f.__doc__: if not optimize: try: c = re.compile("(?P<%s>%s)" % (f.__name__,f.__doc__), re.VERBOSE | reflags) if c.match(""): print("%s:%d: Regular expression for rule '%s' matches empty string." % (file,line,f.__name__)) error = 1 continue except re.error as e: print("%s:%d: Invalid regular expression for rule '%s'. %s" % (file,line,f.__name__,e)) if '#' in f.__doc__: print("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'." % (file,line, f.__name__)) error = 1 continue if debug: print("lex: Adding rule %s -> '%s' (state '%s')" % (f.__name__,f.__doc__, state)) # Okay. The regular expression seemed okay. Let's append it to the master regular # expression we're building regex_list.append("(?P<%s>%s)" % (f.__name__,f.__doc__)) else: print("%s:%d: No regular expression defined for rule '%s'" % (file,line,f.__name__)) # Now add all of the simple rules for name,r in strsym[state]: tokname = toknames[name] if tokname == 'ignore': ignore[state] = r continue if not optimize: if tokname == 'error': raise SyntaxError("lex: Rule '%s' must be defined as a function" % name) error = 1 continue if tokname not in lexobj.lextokens and tokname.find("ignore_") < 0: print("lex: Rule '%s' defined for an unspecified token %s." % (name,tokname)) error = 1 continue try: c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | reflags) if (c.match("")): print("lex: Regular expression for rule '%s' matches empty string." % name) error = 1 continue except re.error as e: print("lex: Invalid regular expression for rule '%s'. %s" % (name,e)) if '#' in r: print("lex: Make sure '#' in rule '%s' is escaped with '\\#'." % name) error = 1 continue if debug: print("lex: Adding rule %s -> '%s' (state '%s')" % (name,r,state)) regex_list.append("(?P<%s>%s)" % (name,r)) if not regex_list: print("lex: No rules defined for state '%s'" % state) error = 1 regexs[state] = regex_list if not optimize: for f in files.keys(): if not _validate_file(f): error = 1 if error: raise SyntaxError("lex: Unable to build lexer.") # From this point forward, we're reasonably confident that we can build the lexer. # No more errors will be generated, but there might be some warning messages. # Build the master regular expressions for state in regexs.keys(): lexre, re_text = _form_master_re(regexs[state],reflags,ldict) lexobj.lexstatere[state] = lexre lexobj.lexstateretext[state] = re_text if debug: for i in range(len(re_text)): print("lex: state '%s'. regex[%d] = '%s'" % (state, i, re_text[i])) # For inclusive states, we need to add the INITIAL state for state,type in stateinfo.items(): if state != "INITIAL" and type == 'inclusive': lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL']) lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL']) lexobj.lexstateinfo = stateinfo lexobj.lexre = lexobj.lexstatere["INITIAL"] lexobj.lexretext = lexobj.lexstateretext["INITIAL"] # Set up ignore variables lexobj.lexstateignore = ignore lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","") # Set up error functions lexobj.lexstateerrorf = errorf lexobj.lexerrorf = errorf.get("INITIAL",None) if warn and not lexobj.lexerrorf: print("lex: Warning. no t_error rule is defined.") # Check state information for ignore and error rules for s,stype in stateinfo.items(): if stype == 'exclusive': if warn and s not in errorf: print("lex: Warning. no error rule is defined for exclusive state '%s'" % s) if warn and s not in ignore and lexobj.lexignore: print("lex: Warning. no ignore rule is defined for exclusive state '%s'" % s) elif stype == 'inclusive': if s not in errorf: errorf[s] = errorf.get("INITIAL",None) if s not in ignore: ignore[s] = ignore.get("INITIAL","") # Create global versions of the token() and input() functions token = lexobj.token input = lexobj.input lexer = lexobj # If in optimize mode, we write the lextab if lextab and optimize: lexobj.writetab(lextab) return lexobj # ----------------------------------------------------------------------------- # runmain() # # This runs the lexer as a main program # ----------------------------------------------------------------------------- def runmain(lexer=None,data=None): if not data: try: filename = sys.argv[1] f = open(filename) data = f.read() f.close() except IndexError: print("Reading from standard input (type EOF to end):") data = sys.stdin.read() if lexer: _input = lexer.input else: _input = input _input(data) if lexer: _token = lexer.token else: _token = token while 1: tok = _token() if not tok: break print("(%s,%r,%d,%d)" % (tok.type, tok.value, tok.lineno,tok.lexpos)) # ----------------------------------------------------------------------------- # @TOKEN(regex) # # This decorator function can be used to set the regex expression on a function # when its docstring might need to be set in an alternative way # ----------------------------------------------------------------------------- def TOKEN(r): def set_doc(f): f.__doc__ = r return f return set_doc # Alternative spelling of the TOKEN decorator Token = TOKEN