
This converts the existing const parser to the basics of a Pratt parser, following the book "Writing An Interpreter In Go" by Thorsten Ball. It doesn't really do anything interesting yet, it simply converts the existing code (with existing tests) to the new structure.
239 строки
5,8 КиБ
Go
239 строки
5,8 КиБ
Go
package cgo
|
|
|
|
// This file implements a parser of a subset of the C language, just enough to
|
|
// parse common #define statements to Go constant expressions.
|
|
|
|
import (
|
|
"fmt"
|
|
"go/ast"
|
|
"go/scanner"
|
|
"go/token"
|
|
"strings"
|
|
)
|
|
|
|
var prefixParseFns map[token.Token]func(*tokenizer) (ast.Expr, *scanner.Error)
|
|
|
|
func init() {
|
|
// This must be done in an init function to avoid an initialization order
|
|
// failure.
|
|
prefixParseFns = map[token.Token]func(*tokenizer) (ast.Expr, *scanner.Error){
|
|
token.IDENT: parseIdent,
|
|
token.INT: parseBasicLit,
|
|
token.FLOAT: parseBasicLit,
|
|
token.STRING: parseBasicLit,
|
|
token.CHAR: parseBasicLit,
|
|
token.LPAREN: parseParenExpr,
|
|
}
|
|
}
|
|
|
|
// parseConst parses the given string as a C constant.
|
|
func parseConst(pos token.Pos, fset *token.FileSet, value string) (ast.Expr, *scanner.Error) {
|
|
t := newTokenizer(pos, fset, value)
|
|
expr, err := parseConstExpr(t)
|
|
t.Next()
|
|
if t.token != token.EOF {
|
|
return nil, &scanner.Error{
|
|
Pos: t.fset.Position(t.pos),
|
|
Msg: "unexpected token " + t.token.String() + ", expected end of expression",
|
|
}
|
|
}
|
|
return expr, err
|
|
}
|
|
|
|
// parseConstExpr parses a stream of C tokens to a Go expression.
|
|
func parseConstExpr(t *tokenizer) (ast.Expr, *scanner.Error) {
|
|
if t.token == token.EOF {
|
|
return nil, &scanner.Error{
|
|
Pos: t.fset.Position(t.pos),
|
|
Msg: "empty constant",
|
|
}
|
|
}
|
|
prefix := prefixParseFns[t.token]
|
|
if prefix == nil {
|
|
return nil, &scanner.Error{
|
|
Pos: t.fset.Position(t.pos),
|
|
Msg: fmt.Sprintf("unexpected token %s", t.token),
|
|
}
|
|
}
|
|
leftExpr, err := prefix(t)
|
|
return leftExpr, err
|
|
}
|
|
|
|
func parseIdent(t *tokenizer) (ast.Expr, *scanner.Error) {
|
|
return &ast.Ident{
|
|
NamePos: t.pos,
|
|
Name: "C." + t.value,
|
|
}, nil
|
|
}
|
|
|
|
func parseBasicLit(t *tokenizer) (ast.Expr, *scanner.Error) {
|
|
return &ast.BasicLit{
|
|
ValuePos: t.pos,
|
|
Kind: t.token,
|
|
Value: t.value,
|
|
}, nil
|
|
}
|
|
|
|
func parseParenExpr(t *tokenizer) (ast.Expr, *scanner.Error) {
|
|
lparen := t.pos
|
|
t.Next()
|
|
x, err := parseConstExpr(t)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
t.Next()
|
|
if t.token != token.RPAREN {
|
|
return nil, unexpectedToken(t, token.RPAREN)
|
|
}
|
|
expr := &ast.ParenExpr{
|
|
Lparen: lparen,
|
|
X: x,
|
|
Rparen: t.pos,
|
|
}
|
|
return expr, nil
|
|
}
|
|
|
|
// unexpectedToken returns an error of the form "unexpected token FOO, expected
|
|
// BAR".
|
|
func unexpectedToken(t *tokenizer, expected token.Token) *scanner.Error {
|
|
return &scanner.Error{
|
|
Pos: t.fset.Position(t.pos),
|
|
Msg: fmt.Sprintf("unexpected token %s, expected %s", t.token, expected),
|
|
}
|
|
}
|
|
|
|
// tokenizer reads C source code and converts it to Go tokens.
|
|
type tokenizer struct {
|
|
pos token.Pos
|
|
fset *token.FileSet
|
|
token token.Token
|
|
value string
|
|
buf string
|
|
}
|
|
|
|
// newTokenizer initializes a new tokenizer, positioned at the first token in
|
|
// the string.
|
|
func newTokenizer(start token.Pos, fset *token.FileSet, buf string) *tokenizer {
|
|
t := &tokenizer{
|
|
pos: start,
|
|
fset: fset,
|
|
buf: buf,
|
|
token: token.ILLEGAL,
|
|
}
|
|
t.Next() // Parse the first token.
|
|
return t
|
|
}
|
|
|
|
// Next consumes the next token in the stream. There is no return value, read
|
|
// the next token from the pos, token and value properties.
|
|
func (t *tokenizer) Next() {
|
|
t.pos += token.Pos(len(t.value))
|
|
for {
|
|
if len(t.buf) == 0 {
|
|
t.token = token.EOF
|
|
return
|
|
}
|
|
c := t.buf[0]
|
|
switch {
|
|
case c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v':
|
|
// Skip whitespace.
|
|
// Based on this source, not sure whether it represents C whitespace:
|
|
// https://en.cppreference.com/w/cpp/string/byte/isspace
|
|
t.pos++
|
|
t.buf = t.buf[1:]
|
|
case c == '(' || c == ')':
|
|
// Single-character tokens.
|
|
switch c {
|
|
case '(':
|
|
t.token = token.LPAREN
|
|
case ')':
|
|
t.token = token.RPAREN
|
|
}
|
|
t.value = t.buf[:1]
|
|
t.buf = t.buf[1:]
|
|
return
|
|
case c >= '0' && c <= '9':
|
|
// Numeric constant (int, float, etc.).
|
|
// Find the last non-numeric character.
|
|
tokenLen := len(t.buf)
|
|
hasDot := false
|
|
for i, c := range t.buf {
|
|
if c == '.' {
|
|
hasDot = true
|
|
}
|
|
if c >= '0' && c <= '9' || c == '.' || c == '_' || c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z' {
|
|
tokenLen = i + 1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
t.value = t.buf[:tokenLen]
|
|
t.buf = t.buf[tokenLen:]
|
|
if hasDot {
|
|
// Integer constants are more complicated than this but this is
|
|
// a close approximation.
|
|
// https://en.cppreference.com/w/cpp/language/integer_literal
|
|
t.token = token.FLOAT
|
|
t.value = strings.TrimRight(t.value, "f")
|
|
} else {
|
|
t.token = token.INT
|
|
t.value = strings.TrimRight(t.value, "uUlL")
|
|
}
|
|
return
|
|
case c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' || c == '_':
|
|
// Identifier. Find all remaining tokens that are part of this
|
|
// identifier.
|
|
tokenLen := len(t.buf)
|
|
for i, c := range t.buf {
|
|
if c >= '0' && c <= '9' || c >= 'A' && c <= 'Z' || c >= 'a' && c <= 'z' || c == '_' {
|
|
tokenLen = i + 1
|
|
} else {
|
|
break
|
|
}
|
|
}
|
|
t.value = t.buf[:tokenLen]
|
|
t.buf = t.buf[tokenLen:]
|
|
t.token = token.IDENT
|
|
return
|
|
case c == '"':
|
|
// String constant. Find the first '"' character that is not
|
|
// preceded by a backslash.
|
|
escape := false
|
|
tokenLen := len(t.buf)
|
|
for i, c := range t.buf {
|
|
if i != 0 && c == '"' && !escape {
|
|
tokenLen = i + 1
|
|
break
|
|
}
|
|
if !escape {
|
|
escape = c == '\\'
|
|
}
|
|
}
|
|
t.token = token.STRING
|
|
t.value = t.buf[:tokenLen]
|
|
t.buf = t.buf[tokenLen:]
|
|
return
|
|
case c == '\'':
|
|
// Char (rune) constant. Find the first '\'' character that is not
|
|
// preceded by a backslash.
|
|
escape := false
|
|
tokenLen := len(t.buf)
|
|
for i, c := range t.buf {
|
|
if i != 0 && c == '\'' && !escape {
|
|
tokenLen = i + 1
|
|
break
|
|
}
|
|
if !escape {
|
|
escape = c == '\\'
|
|
}
|
|
}
|
|
t.token = token.CHAR
|
|
t.value = t.buf[:tokenLen]
|
|
t.buf = t.buf[tokenLen:]
|
|
return
|
|
default:
|
|
t.token = token.ILLEGAL
|
|
return
|
|
}
|
|
}
|
|
}
|