PoPL/02_syntax/lexer.py

#!/usr/bin/env python3

import argparse
import datetime
import ply.lex as lex

# Keywords of the language. Every keyword maps to a token type that is the
# keyword itself in upper case (e.g. 'unless' -> 'UNLESS').
reserved = {
    keyword: keyword.upper()
    for keyword in (
        'var',
        'is',
        'unless',
        'otherwise',
        'until',
        'do',
        'done',
        'procedure',
        'function',
        'return',
        'print',
        'end',
    )
}

# All token types the lexer may emit: brackets, punctuation, operators,
# literals, the three identifier classes, and finally the keyword tokens.
tokens = [
    'LPAREN', 'RPAREN',
    'LSQUARE', 'RSQUARE',
    'LCURLY', 'RCURLY',
    'APOSTROPHE', 'AMPERSAND', 'COMMA', 'DOT',
    'EQ', 'LT',
    'PLUS', 'MINUS', 'MULT', 'DIV',
    'STRING', 'DATE_LITERAL', 'INT_LITERAL',
    'IDENT', 'FUNC_IDENT', 'PROC_IDENT',
    *reserved.values(),
]

# Whitespace rule: matches a run of spaces, tabs and newlines. Nothing is
# returned, so no token is produced for the match; the only effect is to
# advance the lexer's line counter by the number of newlines consumed.
# NOTE: the string below is the rule's regex (read by PLY), not documentation.
def t_whitespace(t):
    r'[ \t\n]+'
    newline_count = t.value.count('\n')
    t.lexer.lineno += newline_count

# Comment rule: text delimited by "(%" and "%)", possibly spanning several
# lines. The match is non-greedy, so a comment ends at the first "%)" and
# several comments in one file are matched separately. Nothing is returned,
# so no token is produced; only the line counter is advanced.
# NOTE: the string below is the rule's regex (read by PLY), not documentation.
def t_comment(t):
    r'\(%(.|\n)*?%\)'
    lines_spanned = t.value.count('\n')
    t.lexer.lineno += lines_spanned

# Single-character tokens, given to PLY as plain regex strings.

# Brackets (opening, closing).
t_LPAREN, t_RPAREN = r'\(', r'\)'
t_LSQUARE, t_RSQUARE = r'\[', r'\]'
t_LCURLY, t_RCURLY = r'\{', r'\}'

# Punctuation.
t_APOSTROPHE = r'\''
t_AMPERSAND = r'&'
t_COMMA = r','
t_DOT = r'\.'

# Comparison operators.
t_EQ = r'='
t_LT = r'<'

# Arithmetic operators.
t_PLUS = r'\+'
t_MINUS = r'-'
t_MULT = r'\*'
t_DIV = r'/'

# String literal: a double-quoted run of characters on a single line ('.'
# does not match a newline), ending at the first closing quote. The token's
# value is replaced by the text between the quotes.
# NOTE: the string below is the rule's regex (read by PLY), not documentation.
def t_STRING(t):
    r'".*?"'
    quoted = t.value
    t.value = quoted[1:-1]  # drop the surrounding double quotes
    return t

# Date literal in the form YYYY-MM-DD. The matched text is converted to a
# datetime.date, which becomes the token's value. Text that has the right
# shape but is not a real calendar date (e.g. 2024-02-30) is reported with
# its line number and lexing stops by raising SystemExit.
# NOTE: the string below is the rule's regex (read by PLY), not documentation.
def t_DATE_LITERAL(t):
    r'\d{4}-\d{2}-\d{2}'
    try:
        t.value = datetime.date.fromisoformat(t.value)
    except ValueError:
        # Only a failed date conversion is an "invalid date"; a bare except
        # would also swallow KeyboardInterrupt and unrelated errors.
        print(f'Invalid date \'{t.value}\' at line {t.lexer.lineno}')
        raise SystemExit
    return t

# Integer literal: an optional minus sign, one to three digits, then any
# number of apostrophe-separated groups of exactly three digits, e.g.
# 1'000'000. The apostrophes are removed and the token's value becomes an int.
# NOTE: the string below is the rule's regex (read by PLY), not documentation.
def t_INT_LITERAL(t):
    r'-?\d{1,3}(\'\d{3})*'
    digits = t.value.replace('\'', '')
    t.value = int(digits)
    return t

# Identifier or keyword: a lowercase letter followed by one or more letters,
# digits or underscores (so at least two characters). If the matched text is
# a key of `reserved`, the token takes that keyword's type; otherwise it is a
# plain IDENT.
# NOTE: the string below is the rule's regex (read by PLY), not documentation.
def t_IDENT(t):
    r'[a-z][a-zA-Z0-9_]+'
    if t.value in reserved:
        t.type = reserved[t.value]
    else:
        t.type = 'IDENT'
    return t

# Function identifier: one uppercase letter followed by one or more lowercase
# letters, digits or underscores. The token is returned unchanged.
# NOTE: the string below is the rule's regex (read by PLY), not documentation.
def t_FUNC_IDENT(t):
    r'[A-Z][a-z0-9_]+'
    return t

# Procedure identifier: two uppercase letters followed by any number of
# uppercase letters, digits or underscores. The token is returned unchanged.
# NOTE: the string below is the rule's regex (read by PLY), not documentation.
def t_PROC_IDENT(t):
    r'[A-Z]{2}[A-Z0-9_]*'
    return t

# Error handler for input that no rule matches: reports the first character
# of the remaining input together with the current line number, then stops
# the program by raising SystemExit.
def t_error(t):
    message = f'Illegal character \'{t.value[0]}\' at line {t.lexer.lineno}'
    print(message)
    raise SystemExit

# Build the module-level lexer; lex.lex() collects the `tokens` list and the
# t_* rules defined above in this module.
lexer = lex.lex()

def tokenize_file(file_path: str) -> None:
    """Tokenize the file at *file_path* and print every token, one per line.

    The file is read as UTF-8 and fed to the module-level ``lexer``.

    Args:
        file_path: Path of the source file to tokenize.

    Raises:
        OSError: If the file cannot be opened or read.
        SystemExit: Raised by the lexer rules on an invalid date literal or
            an illegal character.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        source = file.read()

    # lexer.input() rewinds the read position but leaves the line counter
    # untouched, so reset it here; otherwise a second call on the shared
    # lexer would report line numbers continuing from the previous file.
    lexer.lineno = 1
    lexer.input(source)

    # lexer.token() returns None once the input is exhausted.
    for tok in iter(lexer.token, None):
        print(tok)

if __name__ == '__main__':
    # Command-line entry point: exactly one of --who or -f/--file is required.
    cli = argparse.ArgumentParser()
    mode = cli.add_mutually_exclusive_group(required=True)
    mode.add_argument('--who', action='store_true', help='print out student IDs and NAMEs of authors')
    mode.add_argument('-f', '--file', help='filename to process')

    options = cli.parse_args()

    if not options.who:
        # Normal mode: print the token stream of the given file.
        tokenize_file(options.file)
    else:
        # --who: print the author details instead of lexing anything.
        for line in (
            'Author',
            '  Student ID: 150189237',
            '  Name:       Oskari Alaranta',
        ):
            print(line)
Implement phase 02 syntax analysis 2024-03-04 01:54:42 +02:00			`#!/bin/env python3`

			`import argparse`
			`import datetime`
			`import ply.lex as lex`

			`reserved = {`
			`'var': 'VAR',`
			`'is': 'IS',`
			`'unless': 'UNLESS',`
			`'otherwise': 'OTHERWISE',`
			`'until': 'UNTIL',`
			`'do': 'DO',`
			`'done': 'DONE',`
			`'procedure': 'PROCEDURE',`
			`'function': 'FUNCTION',`
			`'return': 'RETURN',`
			`'print': 'PRINT',`
			`'end': 'END',`
			`}`

			`tokens = [`
			`'LPAREN',`
			`'RPAREN',`
			`'LSQUARE',`
			`'RSQUARE',`
			`'LCURLY',`
			`'RCURLY',`
			`'APOSTROPHE',`
			`'AMPERSAND',`
			`'COMMA',`
			`'DOT',`
			`'EQ',`
			`'LT',`
			`'PLUS',`
			`'MINUS',`
			`'MULT',`
			`'DIV',`
			`'STRING',`
			`'DATE_LITERAL',`
			`'INT_LITERAL',`
			`'IDENT',`
			`'FUNC_IDENT',`
			`'PROC_IDENT',`
			`] + list(reserved.values())`

			`def t_whitespace(t):`
			`r'[ \t\n]+'`
			`t.lexer.lineno += t.value.count('\n')`

			`def t_comment(t):`
Make comments greedy This allows having multiple comments 2024-03-15 08:35:17 +02:00			`r'\(%(.|\n)*?%\)'`
Implement phase 02 syntax analysis 2024-03-04 01:54:42 +02:00			`t.lexer.lineno += t.value.count('\n')`

			`t_LPAREN = r'\('`
			`t_RPAREN = r'\)'`
			`t_LSQUARE = r'\['`
			`t_RSQUARE = r'\]'`
			`t_LCURLY = r'\{'`
			`t_RCURLY = r'\}'`
			`t_APOSTROPHE = r'\''`
			`t_AMPERSAND = r'&'`
			`t_COMMA = r','`
			`t_DOT = r'\.'`
			`t_EQ = r'='`
			`t_LT = r'<'`
			`t_PLUS = r'\+'`
			`t_MINUS = r'-'`
			`t_MULT = r'\*'`
			`t_DIV = r'/'`

			`def t_STRING(t):`
			`r'".*?"'`
			`t.value = t.value[1:-1]`
			`return t`

			`def t_DATE_LITERAL(t):`
			`r'\d{4}-\d{2}-\d{2}'`
Add exception handling for date parsing in lexer 2024-03-04 02:01:19 +02:00			`try:`
			`t.value = datetime.date.fromisoformat(t.value)`
			`except:`
			`print(f'Invalid date \'{t.value}\' at line {t.lexer.lineno}')`
			`raise SystemExit`
Implement phase 02 syntax analysis 2024-03-04 01:54:42 +02:00			`return t`

			`def t_INT_LITERAL(t):`
			`r'-?\d{1,3}(\'\d{3})*'`
			`t.value = int(t.value.replace('\'', ''))`
			`return t`

			`def t_IDENT(t):`
			`r'[a-z][a-zA-Z0-9_]+'`
			`t.type = reserved.get(t.value, 'IDENT')`
			`return t`

			`def t_FUNC_IDENT(t):`
			`r'[A-Z][a-z0-9_]+'`
			`return t`

			`def t_PROC_IDENT(t):`
			`r'[A-Z]{2}[A-Z0-9_]*'`
			`return t`

			`def t_error(t):`
			`print(f'Illegal character \'{t.value[0]}\' at line {t.lexer.lineno}')`
			`raise SystemExit`

			`lexer = lex.lex()`

			`def tokenize_file(file_path: str):`
			`with open(file_path, 'r', encoding='utf-8') as file:`
			`lexer.input(file.read())`

			`tok = lexer.token()`
			`while tok:`
			`print(tok)`
			`tok = lexer.token()`

			`if __name__ == '__main__':`
			`parser = argparse.ArgumentParser()`
			`group = parser.add_mutually_exclusive_group(required=True)`
			`group.add_argument('--who', action='store_true', help='print out student IDs and NAMEs of authors')`
			`group.add_argument('-f', '--file', help='filename to process')`

			`args = parser.parse_args()`

			`if args.who:`
			`print('Author')`
			`print(' Student ID: 150189237')`
			`print(' Name: Oskari Alaranta')`
			`else:`
			`tokenize_file(args.file)`