Package chianti :: Package fortranformat :: Module _lexer
[hide private]
[frames] | no frames]

Source Code for Module chianti.fortranformat._lexer

  1  import sys 
  2  IS_PYTHON3 = sys.version_info[0] >= 3 
  3  
 
  4  if IS_PYTHON3: 
  5      exec('from ._edit_descriptors import *') 
  6      exec('from ._exceptions import *') 
  7  else: 
  8      exec('from _edit_descriptors import *') 
  9      exec('from _exceptions import *') 
 10  
 
 11  # Some lexer tokens to look out for
 
 12  DIGITS = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0'] 
 13  SIGNS = ['+', '-'] 
 14  COMMA = [','] 
 15  DOT = ['.'] 
 16  WHITESPACE = [' ', '\t', '\n'] 
 17  QUOTE_CHARS = ['"', "'"] 
 18  DOUBLE_EDIT_DESCRIPTORS = ['EN', 'ES', 'TR', 'TL', 'BN', 'BZ', 'SP', 'SS'] 
 19  SINGLE_EDIT_DESCRIPTORS = ['A', 'B', 'D', 'E', 'F', 'G', 'I', 'L', 'O', 'P', 'S', 'T', 'X', 'Z', ':', '/'] 
 20  H_EDIT_DESCRIPTOR = ['H'] 
 21  LEFT_PARENS = ['('] 
 22  RIGHT_PARENS = [')'] 
 23  COLON = ':' 
 24  SLASH = '/' 
 25  
 
 26  
 
def lexer(format):
    '''Lex the FORTRAN format statement into tokens.

    Walks ``format`` from left to right and returns a list of Token
    objects. Token types produced here are:

    * 'QUOTED_STRING' - contents of a quoted literal (a doubled delimiter
      inside the literal stands for one delimiter character) or of an H
      edit descriptor string
    * 'INT' - integer written with an explicit sign
    * 'UINT' - unsigned integer equal to zero
    * 'NZUINT' - unsigned non-zero integer
    * 'COMMA', 'DOT', 'LEFT_PARENS', 'RIGHT_PARENS' - value is None
    * whatever _get_ed_type() returns for an edit descriptor; the value is
      the upper-cased descriptor text

    Characters listed in WHITESPACE are skipped between tokens.

    Raises InvalidFormat when a quoted string or an H edit descriptor
    string runs past the end of the format, when a sign is not followed by
    a digit, when an H descriptor is not preceded by an unsigned integer,
    or when a character is not recognised.
    '''
    tokens = []
    # Built once rather than on every pass through the loop
    number_start_chars = DIGITS + SIGNS
    s = -1
    # Length of the string owed to an H edit descriptor that has just been
    # read, or None when no H descriptor is pending
    h_chars = None
    while True:
        # Get the next set of characters
        s = s + 1
        c0, c1, _ = _get_chars(format, s)
        # If at end of format, end it all
        if c0 is None:
            # A pending H descriptor that still expects characters has been
            # cut short; a zero-length one expects nothing and is left alone
            if h_chars:
                raise InvalidFormat('Premature end of H edit descriptor string in format')
            break
        # Read in an H edit descriptor string
        elif h_chars is not None:
            buff = format[s:s + h_chars]
            # Slicing silently truncates, so check the full count was there
            if len(buff) < h_chars:
                raise InvalidFormat('Premature end of H edit descriptor string in format')
            tokens.append(Token('QUOTED_STRING', buff))
            # Leave s on the last consumed character; the loop advances it
            s = s + (h_chars - 1)
            h_chars = None
        # Skip whitespace
        elif c0 in WHITESPACE:
            continue
        # Read in a quoted string
        elif c0 in QUOTE_CHARS:
            delim = c0
            chars = []
            while True:
                s = s + 1
                c0, c1, _ = _get_chars(format, s)
                # Check if an escaped delimiter
                if (c0 == delim) and (c1 == delim):
                    s = s + 1
                    chars.append(delim)
                elif c0 == delim:
                    break
                elif c0 is None:
                    # Premature end of format
                    raise InvalidFormat('Premature end of quoted string in format')
                else:
                    chars.append(c0)
            tokens.append(Token('QUOTED_STRING', ''.join(chars)))
        # Read in an integer
        elif c0 in number_start_chars:
            # Check sign followed by at least one digit
            if (c0 in SIGNS) and (c1 not in DIGITS):
                # TODO: Is whitespace allowed between sign and digit?
                raise InvalidFormat("Orphaned sign '%s' with no digits at position %d" % (c0, s))
            buff = c0
            while True:
                s = s + 1
                c0, c1, _ = _get_chars(format, s)
                # None (end of format) is not a digit either, so this also
                # stops at the end of the format
                if c0 not in DIGITS:
                    break
                buff = buff + c0
            # Step back so the character that ended the number is lexed next
            s = s - 1
            val = int(buff)
            if buff[0] in SIGNS:
                tokens.append(Token('INT', val))
            elif val == 0:
                tokens.append(Token('UINT', val))
            else:
                tokens.append(Token('NZUINT', val))
        # Read in a comma
        elif c0 in COMMA:
            tokens.append(Token('COMMA', None))
        # Read in a dot
        elif c0 in DOT:
            tokens.append(Token('DOT', None))
        # Read in double lettered edit descriptors
        elif (c1 is not None) and ((c0 + c1).upper() in DOUBLE_EDIT_DESCRIPTORS):
            descriptor = (c0 + c1).upper()
            tokens.append(Token(_get_ed_type(descriptor), descriptor))
            s = s + 1
        # Read in an H edit descriptor
        elif c0.upper() in H_EDIT_DESCRIPTOR:
            # The character count is the unsigned integer lexed just before
            # the H; it is removed from the token list and remembered so the
            # next pass reads that many raw characters
            if (len(tokens) > 0) and (tokens[-1].type in ('NZUINT', 'UINT')):
                h_chars = tokens[-1].value
                tokens = tokens[:-1]
            else:
                raise InvalidFormat("Missing H descriptor number argument at position %d" % s)
        # Read in single lettered edit descriptors
        elif c0.upper() in SINGLE_EDIT_DESCRIPTORS:
            descriptor = c0.upper()
            tokens.append(Token(_get_ed_type(descriptor), descriptor))
        # Read in left parens
        elif c0 in LEFT_PARENS:
            tokens.append(Token('LEFT_PARENS', None))
        # Read in right parens
        elif c0 in RIGHT_PARENS:
            tokens.append(Token('RIGHT_PARENS', None))
        else:
            raise InvalidFormat('Character %s not recognised at position %d' % (c0, s))
    return tokens
120
def _get_ed_type(ed_string):
    '''Return the token type name for an edit descriptor string.

    ed_string is tested against ED1, ED2, ... ED10 in that order and the
    name of the first collection containing it ('ED1' ... 'ED10') is
    returned. None is returned when no collection contains it.

    NOTE(review): ED1..ED10 are not defined in this module; presumably they
    arrive through the star import from _edit_descriptors - confirm.
    '''
    if ed_string in ED1:
        return 'ED1'
    if ed_string in ED2:
        return 'ED2'
    if ed_string in ED3:
        return 'ED3'
    if ed_string in ED4:
        return 'ED4'
    if ed_string in ED5:
        return 'ED5'
    if ed_string in ED6:
        return 'ED6'
    if ed_string in ED7:
        return 'ED7'
    if ed_string in ED8:
        return 'ED8'
    if ed_string in ED9:
        return 'ED9'
    if ed_string in ED10:
        return 'ED10'
    # Not a member of any known group
    return None
145
146 -def _get_chars(format, s):
147 try: 148 c0 = format[s] 149 except IndexError: 150 c0 = None 151 try: 152 c1 = format[s+1] 153 except IndexError: 154 c1 = None 155 try: 156 c2 = format[s+2] 157 except IndexError: 158 c2 = None 159 return (c0, c1, c2)
160 161
class InvalidFormat(Exception):
    '''Exception carrying a description of what is wrong with a format.

    The object passed to the constructor is kept on the ``value``
    attribute, and str() of the exception is the repr() of that object.
    '''

    def __init__(self, value):
        super(InvalidFormat, self).__init__(value)
        self.value = value

    def __str__(self):
        return repr(self.value)
167 168
class Token(object):
    '''A lexed token: a type name paired with a value.'''

    def __init__(self, type, value):
        # Stored exactly as given; no validation or conversion is done
        self.value = value
        self.type = type

    def __repr__(self):
        # The leading newline puts each token on its own line when a list
        # of tokens is printed
        fields = (self.type, str(self.value))
        return "\n Token: type=%s,\tvalue=%s" % fields
# Do some testing when run as a module

#if __name__ == '__main__':
#    import doctest
#    import os
#    globs = {'lexer' : lexer}
#    # Need to normalize whitespace since pasting into VIM converts tabs to
#    # spaces
#    doctest.testfile(os.path.join('tests', 'lexer_test.txt'), \
#        globs=globs, optionflags=doctest.NORMALIZE_WHITESPACE)