
tokenize.py

#! /usr/bin/env python3
# tokenize.py
#
# Parses a C/C++/C#/D/Java/Pawn/whatever file into an array of
# tuples (string, type)
#

# punctuator lookup table
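#
# Each row is [ char, remaining, next, token ]:
#   char      - character to match at the current position
#   remaining - number of alternative rows left in this sibling group
#               (0 means this is the last alternative to try)
#   next      - table index of the group that continues the match
#               (0 means the punctuator cannot be extended)
#   token     - the punctuator recognized so far, or None if this
#               prefix is not a valid token on its own (e.g. '%:%')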
punc_table = [
   [ '!',  25,  26, '!'   ],   #   0: '!'
   [ '#',  24,  35, '#'   ],   #   1: '#'
   [ '$',  23,   0, '$'   ],   #   2: '$'
   [ '%',  22,  36, '%'   ],   #   3: '%'
   [ '&',  21,  41, '&'   ],   #   4: '&'
   [ '(',  20,   0, '('   ],   #   5: '('
   [ ')',  19,   0, ')'   ],   #   6: ')'
   [ '*',  18,  43, '*'   ],   #   7: '*'
   [ '+',  17,  44, '+'   ],   #   8: '+'
   [ ',',  16,   0, ','   ],   #   9: ','
   [ '-',  15,  46, '-'   ],   #  10: '-'
   [ '.',  14,  50, '.'   ],   #  11: '.'
   [ '/',  13,  53, '/'   ],   #  12: '/'
   [ ':',  12,  54, ':'   ],   #  13: ':'
   [ ';',  11,   0, ';'   ],   #  14: ';'
   [ '<',  10,  56, '<'   ],   #  15: '<'
   [ '=',   9,  63, '='   ],   #  16: '='
   [ '>',   8,  65, '>'   ],   #  17: '>'
   [ '?',   7,   0, '?'   ],   #  18: '?'
   [ '[',   6,  70, '['   ],   #  19: '['
   [ ']',   5,   0, ']'   ],   #  20: ']'
   [ '^',   4,  71, '^'   ],   #  21: '^'
   [ '{',   3,   0, '{'   ],   #  22: '{'
   [ '|',   2,  72, '|'   ],   #  23: '|'
   [ '}',   1,   0, '}'   ],   #  24: '}'
   [ '~',   0,  74, '~'   ],   #  25: '~'
   [ '<',   3,  30, '!<'  ],   #  26: '!<'
   [ '=',   2,  33, '!='  ],   #  27: '!='
   [ '>',   1,  34, '!>'  ],   #  28: '!>'
   [ '~',   0,   0, '!~'  ],   #  29: '!~'
   [ '=',   1,   0, '!<=' ],   #  30: '!<='
   [ '>',   0,  32, '!<>' ],   #  31: '!<>'
   [ '=',   0,   0, '!<>='],   #  32: '!<>='
   [ '=',   0,   0, '!==' ],   #  33: '!=='
   [ '=',   0,   0, '!>=' ],   #  34: '!>='
   [ '#',   0,   0, '##'  ],   #  35: '##'
   [ ':',   2,  39, '%:'  ],   #  36: '%:'
   [ '=',   1,   0, '%='  ],   #  37: '%='
   [ '>',   0,   0, '%>'  ],   #  38: '%>'
   [ '%',   0,  40, None  ],   #  39: '%:%'
   [ ':',   0,   0, '%:%:'],   #  40: '%:%:'
   [ '&',   1,   0, '&&'  ],   #  41: '&&'
   [ '=',   0,   0, '&='  ],   #  42: '&='
   [ '=',   0,   0, '*='  ],   #  43: '*='
   [ '+',   1,   0, '++'  ],   #  44: '++'
   [ '=',   0,   0, '+='  ],   #  45: '+='
   [ '-',   2,   0, '--'  ],   #  46: '--'
   [ '=',   1,   0, '-='  ],   #  47: '-='
   [ '>',   0,  49, '->'  ],   #  48: '->'
   [ '*',   0,   0, '->*' ],   #  49: '->*'
   [ '*',   1,   0, '.*'  ],   #  50: '.*'
   [ '.',   0,  52, '..'  ],   #  51: '..'
   [ '.',   0,   0, '...' ],   #  52: '...'
   [ '=',   0,   0, '/='  ],   #  53: '/='
   [ ':',   1,   0, '::'  ],   #  54: '::'
   [ '>',   0,   0, ':>'  ],   #  55: ':>'
   [ '%',   4,   0, '<%'  ],   #  56: '<%'
   [ ':',   3,   0, '<:'  ],   #  57: '<:'
   [ '<',   2,  61, '<<'  ],   #  58: '<<'
   [ '=',   1,   0, '<='  ],   #  59: '<='
   [ '>',   0,  62, '<>'  ],   #  60: '<>'
   [ '=',   0,   0, '<<=' ],   #  61: '<<='
   [ '=',   0,   0, '<>=' ],   #  62: '<>='
   [ '=',   0,  64, '=='  ],   #  63: '=='
   [ '=',   0,   0, '===' ],   #  64: '==='
   [ '=',   1,   0, '>='  ],   #  65: '>='
   [ '>',   0,  67, '>>'  ],   #  66: '>>'
   [ '=',   1,   0, '>>=' ],   #  67: '>>='
   [ '>',   0,  69, '>>>' ],   #  68: '>>>'
   [ '=',   0,   0, '>>>='],   #  69: '>>>='
   [ ']',   0,   0, '[]'  ],   #  70: '[]'
   [ '=',   0,   0, '^='  ],   #  71: '^='
   [ '=',   1,   0, '|='  ],   #  72: '|='
   [ '|',   0,   0, '||'  ],   #  73: '||'
   [ '=',   1,   0, '~='  ],   #  74: '~='
   [ '~',   0,   0, '~~'  ],   #  75: '~~'
]

#
# Token types:
#  0 = newline
#  1 = punctuator
#  2 = integer
#  3 = float
#  4 = string
#  5 = identifier
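#
# Example: tokenizing "a = 5;" yields
#   [('a', 5), ('=', 1), ('5', 2), (';', 1)]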
#

class tokenizer:
      def __init__(self):
            self.tokens   = []
            self.text     = ''
            self.text_idx = 0

      def tokenize_text(self, in_text):
            self.tokens   = []
            self.text     = in_text
            self.text_idx = 0

            print(in_text)
            try:
                  while self.text_idx < len(self.text):
                        if self.parse_whitespace():
                              continue
                        elif self.text[self.text_idx] == '\\' and self.text_idx + 1 < len(self.text) and self.text[self.text_idx + 1] == '\n':
                              self.text_idx += 2
                              continue
                        elif self.parse_comment():
                              continue
                        elif self.parse_number():
                              continue
                        elif self.parse_string():
                              continue
                        elif self.parse_identifier():
                              continue
                        elif self.parse_punctuator():
                              continue
                        else:
                              print('confused:', self.text[self.text_idx:])
                              break
            except:
                  print('bombed')
                  raise

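      # Skips spaces and tabs. A whitespace run that contains at least one
      # newline is collapsed into a single newline token.
      # Returns True if any whitespace was consumed.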
      def parse_whitespace(self):
            start_idx = self.text_idx
            hit_newline = False
            while self.text_idx < len(self.text):
                  if self.text[self.text_idx] in '\n\r':
                        hit_newline = True
                  elif self.text[self.text_idx] not in ' \t':
                        break
                  self.text_idx += 1

            if hit_newline:
                  self.tokens.append(('\n', 0))
            return start_idx != self.text_idx

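      # Consumes a '//' comment up to (but not including) the newline, or a
      # '/* ... */' comment including the closing '*/'. No token is emitted.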
      def parse_comment(self):
            if self.text[self.text_idx] != '/' or self.text_idx + 1 >= len(self.text) or self.text[self.text_idx + 1] not in '/*':
                  return False
            if self.text[self.text_idx + 1] == '/':
                  while self.text_idx < len(self.text):
                        if self.text[self.text_idx] in '\n\r':
                              break
                        self.text_idx += 1
            else:
                  while self.text_idx < len(self.text) - 1:
                        if self.text[self.text_idx] == '*' and self.text[self.text_idx + 1] == '/':
                              self.text_idx += 2
                              break
                        self.text_idx += 1
            return True

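      # Consumes an identifier: a letter, '_', or '@' followed by any mix of
      # letters, digits, '_', and '@'. Emits a type-5 token.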
      def parse_identifier(self):
            if self.text[self.text_idx].upper() not in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ':
                  return False
            start_idx = self.text_idx
            while self.text_idx < len(self.text) and self.text[self.text_idx].upper() in '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890':
                  self.text_idx += 1
            self.tokens.append((self.text[start_idx : self.text_idx], 5))
            return True

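      # Consumes a single- or double-quoted string, honoring backslash
      # escapes and an optional L (wide string) prefix. Emits a type-4 token.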
      def parse_string(self):
            starter = 0
            start_ch = self.text[self.text_idx]
            if start_ch == 'L' and self.text_idx + 1 < len(self.text):
                  starter = 1
                  start_ch = self.text[self.text_idx + 1]
            if start_ch not in '"\'':
                  return False
            start_idx = self.text_idx
            self.text_idx += starter + 1
            escaped = False
            while self.text_idx < len(self.text):
                  if escaped:
                        escaped = False
                  else:
                        if self.text[self.text_idx] == '\\':
                              escaped = True
                        elif self.text[self.text_idx] == start_ch:
                              self.text_idx += 1
                              break
                  self.text_idx += 1

            self.tokens.append((self.text[start_idx : self.text_idx], 4))
            return True

      # Checks for punctuators
      # Returns whether a punctuator was consumed (True or False)
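      #
      # Example: matching '>>=' walks the table as follows:
      #   '>' matches row 17 -> saved '>',   continue in the group at row 65
      #   '>' matches row 66 -> saved '>>',  continue in the group at row 67
      #   '=' matches row 67 -> saved '>>=', next index is 0, done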
      def parse_punctuator(self):
            tab_idx = 0
            punc_len = 0
            saved_punc = None
            while True:
                  pte = punc_table[tab_idx]
                  if self.text_idx < len(self.text) and pte[0] == self.text[self.text_idx]:
                        if pte[3] is not None:
                              saved_punc = pte[3]
                        self.text_idx += 1
                        tab_idx = pte[2]
                        if tab_idx == 0:
                              break
                  elif pte[1] == 0:
                        break
                  else:
                        tab_idx += 1
            if saved_punc is not None:
                  self.tokens.append((saved_punc, 1))
                  return True
            return False

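      # Consumes an integer (type 2) or float (type 3) literal, including
      # hex/octal/binary prefixes, decimal points, exponents, and the
      # per-language suffixes listed below.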
      def parse_number(self):
            # A number must start with a digit or a dot followed by a digit
            ch = self.text[self.text_idx]
            if not ch.isdigit() and (ch != '.' or self.text_idx + 1 >= len(self.text) or not self.text[self.text_idx + 1].isdigit()):
                  return False
            token_type = 2 # integer
            if ch == '.':
                  token_type = 3 # float
            did_hex   = False
            start_idx = self.text_idx

            # Check for Hex, Octal, or Binary
            # Note that only D and Pawn support binary, but who cares?
            #
            if ch == '0' and self.text_idx + 1 < len(self.text):
                  self.text_idx += 1
                  ch = self.text[self.text_idx].upper()
                  if ch == 'X':                # hex
                        did_hex = True
                        self.text_idx += 1
                        while self.text_idx < len(self.text) and self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                              self.text_idx += 1
                  elif ch == 'B':              # binary
                        self.text_idx += 1
                        while self.text_idx < len(self.text) and self.text[self.text_idx] in '_01':
                              self.text_idx += 1
                  elif '0' <= ch <= '7':       # octal (but allow decimal)
                        self.text_idx += 1
                        while self.text_idx < len(self.text) and self.text[self.text_idx] in '_0123456789':
                              self.text_idx += 1
                  else:
                        # either just 0 or 0.1 or 0UL, etc
                        pass
            else:
                  # Regular int or float
                  while self.text_idx < len(self.text) and self.text[self.text_idx] in '_0123456789':
                        self.text_idx += 1

            # Check if we stopped on a decimal point
            if self.text_idx < len(self.text) and self.text[self.text_idx] == '.':
                  self.text_idx += 1
                  token_type = 3 # float
                  if did_hex:
                        while self.text_idx < len(self.text) and self.text[self.text_idx] in '_0123456789abcdefABCDEF':
                              self.text_idx += 1
                  else:
                        while self.text_idx < len(self.text) and self.text[self.text_idx] in '_0123456789':
                              self.text_idx += 1

            # Check exponent
            # Valid exponents per language (not that it matters):
            # C/C++/D/Java: eEpP
            # C#/Pawn:      eE
            if self.text_idx < len(self.text) and self.text[self.text_idx] in 'eEpP':
                  token_type = 3 # float
                  self.text_idx += 1
                  if self.text_idx < len(self.text) and self.text[self.text_idx] in '+-':
                        self.text_idx += 1
                  while self.text_idx < len(self.text) and self.text[self.text_idx] in '_0123456789':
                        self.text_idx += 1

            # Check the suffixes
            # Valid suffixes per language (not that it matters):
            #        Integer       Float
            # C/C++: uUlL          lLfF
            # C#:    uUlL          fFdDMm
            # D:     uUL           ifFL
            # Java:  lL            fFdD
            # Pawn:  (none)        (none)
            #
            # Note that i, f, d, and m only appear in floats.
            while self.text_idx < len(self.text):
                  if self.text[self.text_idx] in 'iIfFdDmM':
                        token_type = 3 # float
                  elif self.text[self.text_idx] not in 'lLuU':
                        break
                  self.text_idx += 1

            self.tokens.append((self.text[start_idx : self.text_idx], token_type))
            return True

text = """
1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
123 // some comment
a = b + c;
#define abc \\
        5
d = 5 /* hello */ + 3;
"""

t = tokenizer()
t.tokenize_text(text)
print(t.tokens)

