#!/usr/bin/python
"""Regex-driven lexer for C source files, plus helpers that inventory the
.c/.h files of a project tree.

Run as a script:  <script> dirname fname
(``dirname`` is concatenated with ``fname`` as-is, so it must end with a
path separator.)
"""

import os
import os.path
import re
import string
import sys


class LexException(Exception):
    """Raised when the lexer meets input it cannot tokenize.

    The human-readable message is available as ``errmsg``.
    """

    def __init__(self, errmsg):
        Exception.__init__(self, errmsg)
        self.errmsg = errmsg


class CFile:
    """Tokenizer for a single C source or header file.

    After ``parse()`` the results are available as:

    * ``tokens``   - ``(type, start, text)`` tuples; comments are stored as
                     ``("comment", text)``.
    * ``comments`` - ``(start, text)`` tuples.
    * ``includes`` - ``(start, resolved_path)`` tuples.
    * ``preprocs`` - ``(type, start, text)`` tuples for preprocessor lines.
    """

    def __init__(self, dirname, fname):
        """Read ``dirname + fname`` and prepare the lexer tables.

        ``dirname`` and ``fname`` are concatenated without inserting a
        separator.  Raises IOError if the file cannot be read.
        """
        self.dirname = dirname
        self.fname = fname
        with open(dirname + fname, "r") as f:
            self.contents = f.read()
        self.current = 0
        self.size = len(self.contents)
        self.tokens = []
        self.comments = []
        self.includes = []
        self.preprocs = []
        self.regex = {}
        self.actions = {}
        self.simpleops = {}
        self.keywords = []
        # register stuff
        self.register_simpleops()
        self.register_regexes()

    def get_eol(self, start):
        """Return the index just past the logical line that contains ``start``.

        A backslash makes the line continue past the next newline.
        NOTE: any backslash on the line is treated that way, not only one
        placed directly before the newline.
        If no terminating newline exists, ``self.size`` is returned.
        """
        pos = start
        size = self.size
        while pos < size:
            ch = self.contents[pos]
            if ch == '\n':
                return pos + 1
            if ch == '\\':
                newline = self.contents.find('\n', pos)
                if newline == -1:
                    return size
                # Resume scanning on the continuation line.
                pos = newline + 1
                continue
            pos += 1
        # Ran off the end of the buffer: the last line has no newline.
        return size

    def add_action(self, trigger, action):
        """Register ``action`` to be called when regex ``trigger`` matches."""
        self.actions.setdefault(trigger, []).append(action)

    def add_simpleop(self, trigger, keyword, typ):
        """Register the literal ``keyword`` under its first character.

        Literals whose trigger is a lower-case ASCII letter are also recorded
        in ``self.keywords``.  ``typ`` is accepted but not stored.
        """
        self.simpleops.setdefault(trigger, []).append(keyword)
        if trigger in string.ascii_lowercase:
            self.keywords.append(keyword)

    def register_simpleops(self):
        """Register the C keywords, operators and punctuators."""
        keywords = (
            "auto", "break", "case", "char", "const", "continue", "default",
            "do", "double", "else", "enum", "extern", "float", "for", "goto",
            "if", "int", "long", "register", "return", "short", "signed",
            "sizeof", "static", "struct", "switch", "typedef", "union",
            "unsigned", "void", "volatile", "while",
        )
        for word in keywords:
            self.add_simpleop(word[0], word, "keyword")

        operators = (
            ("...", "ellipsis"),
            (">>=", "right_assign"),
            ("<<=", "left_assign"),
            ("+=", "add_assign"),
            ("-=", "sub_assign"),
            ("*=", "mul_assign"),
            ("/=", "div_assign"),
            ("%=", "mod_assign"),
            ("&=", "and_assign"),
            ("^=", "xor_assign"),
            ("|=", "or_assign"),
            (">>", "right_op"),
            ("<<", "left_op"),
            ("++", "inc_op"),
            ("--", "dec_op"),
            ("->", "ptr_op"),
            ("&&", "and_op"),
            ("||", "or_op"),
            ("<=", "le_op"),
            (">=", "ge_op"),
            ("==", "eq_op"),
            ("!=", "ne_op"),
            (";", ";"),
            ("=", "="),
            ("(", "("),
            (")", ")"),
            ("[", "["),
            ("<:", "["),
            ("<%", "{"),
            ("%>", "}"),
            ("]", "]"),
            (":", ":"),
            (":>", "]"),
            (".", "."),
            ("&", "&"),
            ("!", "!"),
            ("~", "~"),
            ("-", "-"),
            ("+", "+"),
            ("*", "*"),
            ("/", "/"),
            ("%", "%"),
            ("<", "<"),
            (">", ">"),
            ("^", "^"),
            ("|", "|"),
            ("?", "?"),
            ("{", "{"),
            ("}", "}"),
            (",", ","),
        )
        for text, typ in operators:
            self.add_simpleop(text[0], text, typ)

    def register_regexes(self):
        """Compile the preprocessor/token regexes and attach their actions."""
        preproc = self.default_preproc_action
        token = self.default_token_action
        table = (
            ("include",
             r'^\s*#\s*include\s+(\"|<)(((\w+)|(\w+(/\w+)+))\.(h|c))(\"|>)\s+',
             self.get_include),
            ("define", r'^\s*(#\s*define(.+(\\)?\s+)*)', preproc),
            ("if", r'^\s*#\s*if\s+\(.+\)\s+', preproc),
            ("ifdef", r'^\s*#\s*ifdef\s+\w+\s+', preproc),
            ("ifdefined", r'^\s*#\s*if\s+defined\(\w+\)\s+', preproc),
            ("ifndef", r'^\s*#\s*ifndef\s+\w+\s+', preproc),
            ("else", r'^\s*#\s*else\s+', preproc),
            # Was '^s*#...' (missing backslash before the first 's').
            ("endif", r'^\s*#\s*endif\s+', preproc),
            ("e", r'(E|e)(\+|\-)?\d+', token),
            ("is", r'(u|U|l|L)*', token),
            ("identifier", r'\w{1}(\w|\d)*', token),
            ("constant_hex", r'0(x|X)[a-fA-F0-9]+((u|U|l|L)*)', token),
            # The old alternation always took its suffix-less first branch,
            # so "012L" lost its suffix.
            ("constant_dec1", r'0\d+((u|U|l|L)*)', token),
            ("constant_dec2", r'\d+((u|U|l|L)*)', token),
            # Literals stop at the first unescaped closing quote instead of
            # greedily running to the last quote on the line.
            ("constant_l1", r"\w?'(\\.|[^'\\\n])*'", token),
            ("constant_l2", r'((\d+(E|e)(\+|\-)?\d+)(f|F|l|L)?)', token),
            ("constant_dec3", r'\d*\.\d+((E|e)(\+|\-)?\d+)?(f|F|l|L)?', token),
            ("constant_dec4", r'\d+\.\d*((E|e)(\+|\-)?\d+)?(f|F|l|L)?', token),
            ("constant_l3", r'\w?"(\\.|[^"\\\n])*"', token),
        )
        for name, pattern, action in table:
            self.regex[name] = re.compile(pattern)
            self.add_action(name, action)

    def execute_regex(self, start, trigger):
        """Match regex ``trigger`` against the logical line starting at ``start``.

        On a match the registered actions run in order; the first non-None
        result is returned and ``self.current`` advances by the match length.
        Returns None when nothing matched.  A zero-length match counts as no
        match, because consuming nothing would stall ``parse()``.
        """
        eol = self.get_eol(start)
        res = self.regex[trigger].match(self.contents[start:eol])
        if res is None or res.end() == res.start():
            return None
        for action in self.actions[trigger]:
            rt = action(trigger, start, res)
            if rt is not None:
                self.current += res.end() - res.start()
                return rt
        return None

    def execute_simpleop(self, start, trigger):
        """Match one of the literals registered under ``trigger`` at ``start``.

        Candidates are tried longest first so that e.g. ">>=" wins over ">>"
        and ">".  On success a ``(text, start, text)`` token is appended,
        ``self.current`` advances and ``trigger`` is returned; otherwise None.
        """
        candidates = self.simpleops.get(trigger)
        if not candidates:
            return None
        for keyword in sorted(candidates, key=len, reverse=True):
            if self.contents.startswith(keyword, start):
                self.current += len(keyword)
                self.tokens.append((keyword, start, keyword))
                return trigger
        return None

    def default_token_action(self, typ, start, res):
        """Append the matched text to ``self.tokens``.

        The token type is "keyword" when the text is a registered keyword,
        otherwise ``typ``.  Always returns True.
        """
        chunk = res.group()
        if chunk in self.keywords:
            self.tokens.append(("keyword", start, chunk))
        else:
            self.tokens.append((typ, start, chunk))
        return True

    def get_comment(self):
        """Consume the ``/* ... */`` comment starting at ``self.current``.

        Stores ``("comment", text)`` in ``self.tokens`` and ``(start, text)``
        in ``self.comments``.  Raises LexException if the comment is never
        closed.
        """
        start = self.current
        # Search after the opener so that "/*/" is not taken as a full comment.
        end = self.contents.find("*/", start + 2)
        if end == -1:
            raise LexException("Lex: Comment reached end of file")
        comment = self.contents[start:end + 2]
        self.current = end + 2
        # NOTE: comment tokens are 2-tuples, unlike every other token.
        self.tokens.append(("comment", comment))
        self.comments.append((start, comment))

    def default_preproc_action(self, typ, start, res):
        """Append the matched preprocessor text to ``self.preprocs``."""
        self.preprocs.append((typ, start, res.group()))
        return True

    def get_include(self, typ, start, res):
        """Record an #include line in ``self.preprocs`` and ``self.includes``.

        ``<...>`` includes are resolved under /usr/include/, quoted includes
        are prefixed with ``self.dirname``.  Always returns True.
        """
        self.preprocs.append(("include", start, res.group()))
        if res.group(1) == '<':
            path = "/usr/include/" + res.group(2)
        else:
            path = self.dirname + res.group(2)
        self.includes.append((start, path))
        return True

    def parse(self):
        """Tokenize the whole file.

        Raises LexException on input that no rule can consume.
        """
        word_start = string.ascii_lowercase + string.ascii_uppercase + "_\"\'"
        specials = "!#%&()*+,-/:;<=>?[]^{}~.&|^"
        blanks = string.whitespace + "\\"
        # Floating-point patterns come before the integer ones; otherwise
        # "1.5" is consumed as the integer "1".
        constants = ("hex", "dec3", "dec4", "l2", "dec1", "dec2")
        directives = ("include", "define", "else", "ifdef", "endif",
                      "ifndef", "ifdefined", "if")

        while self.current < self.size:
            pos = self.current
            ch = self.contents[pos]
            if self.contents.startswith("/*", pos):
                self.get_comment()
            elif ch in word_start:
                matched = (self.execute_regex(pos, "identifier")
                           or self.execute_regex(pos, "constant_l1")
                           or self.execute_regex(pos, "constant_l3"))
                if matched is None and self.execute_simpleop(pos, ch) is None:
                    fallback = None
                    if ch in "eE":
                        fallback = "e"
                    elif ch in "uUlL":
                        fallback = "is"
                    # A failed fallback must raise, or the loop never advances.
                    if fallback is None or self.execute_regex(pos, fallback) is None:
                        raise LexException(
                            "Character has no meaning at %d - %s" % (pos, ch))
            elif ch in string.digits:
                for suffix in constants:
                    if self.execute_regex(pos, "constant_" + suffix) is not None:
                        break
                else:
                    raise LexException("Digit has no meaning")
            elif ch in specials:
                if self.execute_simpleop(pos, ch) is None:
                    for name in directives:
                        if self.execute_regex(pos, name) is not None:
                            break
                    else:
                        raise LexException(
                            "# found but no #include or #define at %d character in %s character"
                            % (pos, ch))
            elif ch in blanks:
                self.current += 1
            else:
                raise LexException(
                    "Lexer should not be here, position=%d, character=%s" % (pos, ch))


def filterDir(dirhash, dirname, fnames):
    """Store the .c and .h names among ``fnames`` in ``dirhash[dirname]``.

    The entry is a dict with the keys "cfiles" and "hfiles".
    """
    entry = {"cfiles": [], "hfiles": []}
    for fname in fnames:
        ext = os.path.splitext(fname)[1]
        if ext == ".c":
            entry["cfiles"].append(fname)
        elif ext == ".h":
            entry["hfiles"].append(fname)
    dirhash[dirname] = entry


def getProjectFiles(path):
    """Return ``{dirname: {"cfiles": [...], "hfiles": [...]}}`` for the tree at ``path``."""
    dirhash = {}
    # os.walk replaces os.path.walk, which no longer exists in Python 3.
    # Directory names are passed along too, because os.path.walk handed the
    # visitor every directory entry.
    for dirname, subdirs, files in os.walk(path):
        filterDir(dirhash, dirname, subdirs + files)
    return dirhash


def getFileCount(dirhash):
    """Return ``(header_count, c_file_count)`` summed over all directories."""
    hcnt = 0
    ccnt = 0
    for entry in dirhash.values():
        hcnt += len(entry["hfiles"])
        ccnt += len(entry["cfiles"])
    return (hcnt, ccnt)


def processFile(dir, fname):
    """Print the name of the file being processed."""
    print("processing  %s" % (dir + fname))


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: %s dirname fname" % (sys.argv[0]))
        # Stop here; continuing would fail on the missing arguments.
        sys.exit(1)
    #dirhash = getProjectFiles(sys.argv[1])
    #print("Project has %d header files and %d c files" % (getFileCount(dirhash)))
    try:
        cfile = CFile(sys.argv[1], sys.argv[2])
        cfile.parse()
        print("size= %s" % cfile.size)
        print("includes:")
        for i in cfile.includes:
            print(i)
        print("preprocs:")
        for i in cfile.preprocs:
            print(i)
        print("tokens:")
        for i in cfile.tokens:
            print(i)
        print("comments:")
        for i in cfile.comments:
            print(i)
    except LexException as le:
        print(le.errmsg)