From 832c0999d0c027ba7bc54adc7e53c068424ef86a Mon Sep 17 00:00:00 2001 From: gedi Date: Sat, 6 Jun 2015 20:07:54 +0300 Subject: [PATCH] initial commit with gherkin lexer --- gherkin/lexer/lexer.go | 145 ++++++++++++++++++++++++++++ gherkin/lexer/lexer_test.go | 185 ++++++++++++++++++++++++++++++++++++ gherkin/lexer/matchers.go | 14 +++ gherkin/lexer/token.go | 16 ++++ gherkin/lexer/token_type.go | 64 +++++++++++++ gherkin/parse.go | 46 +++++++++ 6 files changed, 470 insertions(+) create mode 100644 gherkin/lexer/lexer.go create mode 100644 gherkin/lexer/lexer_test.go create mode 100644 gherkin/lexer/matchers.go create mode 100644 gherkin/lexer/token.go create mode 100644 gherkin/lexer/token_type.go create mode 100644 gherkin/parse.go diff --git a/gherkin/lexer/lexer.go b/gherkin/lexer/lexer.go new file mode 100644 index 0000000..48c6e17 --- /dev/null +++ b/gherkin/lexer/lexer.go @@ -0,0 +1,145 @@ +package lexer + +import ( + "bufio" + "io" + "strings" + "unicode" +) + +type Lexer struct { + reader *bufio.Reader + peek *Token + lines int +} + +func New(r io.Reader) *Lexer { + return &Lexer{ + reader: bufio.NewReader(r), + } +} + +func (l *Lexer) Next() (t *Token) { + if l.peek != nil { + t = l.peek + l.peek = nil + return + } + return l.read() +} + +func (l *Lexer) Peek() *Token { + if l.peek == nil { + l.peek = l.read() + } + return l.peek +} + +func (l *Lexer) read() *Token { + line, err := l.reader.ReadString(byte('\n')) + if err != nil && len(line) == 0 { + return &Token{ + Type: EOF, + Line: l.lines, + } + } + l.lines++ + line = strings.TrimRightFunc(line, unicode.IsSpace) + // newline + if len(line) == 0 { + return &Token{ + Type: NEW_LINE, + Line: l.lines - 1, + } + } + // comment + if m := matchers["comment"].FindStringSubmatch(line); len(m) > 0 { + return &Token{ + Type: COMMENT, + Indent: len(m[1]), + Line: l.lines - 1, + Value: m[2], + } + } + // pystring + if m := matchers["pystring"].FindStringSubmatch(line); len(m) > 0 { + return &Token{ + Type: 
PYSTRING, + Indent: len(m[1]), + Line: l.lines - 1, + } + } + // step + if m := matchers["step"].FindStringSubmatch(line); len(m) > 0 { + tok := &Token{ + Indent: len(m[1]), + Line: l.lines - 1, + Value: m[3], + } + switch m[2] { + case "Given": + tok.Type = GIVEN + case "When": + tok.Type = WHEN + case "Then": + tok.Type = THEN + case "And": + tok.Type = AND + case "But": + tok.Type = BUT + } + return tok + } + // scenario + if m := matchers["scenario"].FindStringSubmatch(line); len(m) > 0 { + return &Token{ + Type: SCENARIO, + Indent: len(m[1]), + Line: l.lines - 1, + Value: m[2], + } + } + // background + if m := matchers["background"].FindStringSubmatch(line); len(m) > 0 { + return &Token{ + Type: BACKGROUND, + Indent: len(m[1]), + Line: l.lines - 1, + } + } + // feature + if m := matchers["feature"].FindStringSubmatch(line); len(m) > 0 { + return &Token{ + Type: FEATURE, + Indent: len(m[1]), + Line: l.lines - 1, + Value: m[2], + } + } + // tags + if m := matchers["tags"].FindStringSubmatch(line); len(m) > 0 { + return &Token{ + Type: TAGS, + Indent: len(m[1]), + Line: l.lines - 1, + Value: m[2], + } + } + // table row + if m := matchers["table_row"].FindStringSubmatch(line); len(m) > 0 { + return &Token{ + Type: TABLE_ROW, + Indent: len(m[1]), + Line: l.lines - 1, + Value: m[2], + } + } + // text + text := strings.TrimLeftFunc(line, unicode.IsSpace) + return &Token{ + Type: TEXT, + Line: l.lines - 1, + Value: text, + Indent: len(line) - len(text), + } +} diff --git a/gherkin/lexer/lexer_test.go b/gherkin/lexer/lexer_test.go new file mode 100644 index 0000000..28e28b0 --- /dev/null +++ b/gherkin/lexer/lexer_test.go @@ -0,0 +1,185 @@ +package lexer + +import ( + "strings" + "testing" +) + +var samples = map[string]string{ + "feature": `Feature: gherkin lexer + in order to run features + as gherkin lexer + I need to be able to parse a feature`, + + "background": `Background:`, + + "scenario": "Scenario: tokenize feature file", + + "step_given": `Given a feature 
file`, + + "step_when": `When I try to read it`, + + "comment": `# an important comment`, + + "step_then": `Then it should give me tokens`, + + "step_given_table": `Given there are users: + | name | lastname | num | + | Jack | Sparrow | 4 | + | John | Doe | 79 |`, +} + +func indent(n int, s string) string { + return strings.Repeat(" ", n) + s +} + +func Test_feature_read(t *testing.T) { + l := New(strings.NewReader(samples["feature"])) + tok := l.Next() + if tok.Type != FEATURE { + t.Fatalf("Expected a 'feature' type, but got: '%s'", tok.Type) + } + val := "gherkin lexer" + if tok.Value != val { + t.Fatalf("Expected a token value to be '%s', but got: '%s'", val, tok.Value) + } + if tok.Line != 0 { + t.Fatalf("Expected a token line to be '0', but got: '%d'", tok.Line) + } + if tok.Indent != 0 { + t.Fatalf("Expected a token identation to be '0', but got: '%d'", tok.Indent) + } + + tok = l.Next() + if tok.Type != TEXT { + t.Fatalf("Expected a 'text' type, but got: '%s'", tok.Type) + } + val = "in order to run features" + if tok.Value != val { + t.Fatalf("Expected a token value to be '%s', but got: '%s'", val, tok.Value) + } + if tok.Line != 1 { + t.Fatalf("Expected a token line to be '1', but got: '%d'", tok.Line) + } + if tok.Indent != 2 { + t.Fatalf("Expected a token identation to be '2', but got: '%d'", tok.Indent) + } + + tok = l.Next() + if tok.Type != TEXT { + t.Fatalf("Expected a 'text' type, but got: '%s'", tok.Type) + } + val = "as gherkin lexer" + if tok.Value != val { + t.Fatalf("Expected a token value to be '%s', but got: '%s'", val, tok.Value) + } + if tok.Line != 2 { + t.Fatalf("Expected a token line to be '2', but got: '%d'", tok.Line) + } + if tok.Indent != 2 { + t.Fatalf("Expected a token identation to be '2', but got: '%d'", tok.Indent) + } + + tok = l.Next() + if tok.Type != TEXT { + t.Fatalf("Expected a 'text' type, but got: '%s'", tok.Type) + } + val = "I need to be able to parse a feature" + if tok.Value != val { + t.Fatalf("Expected a token 
value to be '%s', but got: '%s'", val, tok.Value) + } + if tok.Line != 3 { + t.Fatalf("Expected a token line to be '3', but got: '%d'", tok.Line) + } + if tok.Indent != 2 { + t.Fatalf("Expected a token identation to be '2', but got: '%d'", tok.Indent) + } + + tok = l.Next() + if tok.Type != EOF { + t.Fatalf("Expected an 'eof' type, but got: '%s'", tok.Type) + } +} + +func Test_minimal_feature(t *testing.T) { + file := strings.Join([]string{ + samples["feature"] + "\n", + + indent(2, samples["background"]), + indent(4, samples["step_given"]) + "\n", + + indent(2, samples["comment"]), + indent(2, samples["scenario"]), + indent(4, samples["step_given"]), + indent(4, samples["step_when"]), + indent(4, samples["step_then"]), + }, "\n") + l := New(strings.NewReader(file)) + + var tokens []TokenType + for tok := l.Next(); tok.Type != EOF; tok = l.Next() { + tokens = append(tokens, tok.Type) + } + expected := []TokenType{ + FEATURE, + TEXT, + TEXT, + TEXT, + NEW_LINE, + + BACKGROUND, + GIVEN, + NEW_LINE, + + COMMENT, + SCENARIO, + GIVEN, + WHEN, + THEN, + } + for i := 0; i < len(expected); i++ { + if expected[i] != tokens[i] { + t.Fatalf("expected token '%s' at position: %d, is not the same as actual token: '%s'", expected[i], i, tokens[i]) + } + } +} + +func Test_table_row_reading(t *testing.T) { + file := strings.Join([]string{ + indent(2, samples["background"]), + indent(4, samples["step_given_table"]), + indent(4, samples["step_given"]), + }, "\n") + l := New(strings.NewReader(file)) + + var types []TokenType + var values []string + var indents []int + for tok := l.Next(); tok.Type != EOF; tok = l.Next() { + types = append(types, tok.Type) + values = append(values, tok.Value) + indents = append(indents, tok.Indent) + } + expectedTypes := []TokenType{ + BACKGROUND, + GIVEN, + TABLE_ROW, + TABLE_ROW, + TABLE_ROW, + GIVEN, + } + expectedIndents := []int{2, 4, 6, 6, 6, 4} + for i := 0; i < len(expectedTypes); i++ { + if expectedTypes[i] != types[i] { + t.Fatalf("expected 
// --- gherkin/lexer/matchers.go ---

// matchers classifies a raw feature-file line. Every pattern anchors at
// the start and captures the leading whitespace in group 1 so the lexer
// can record indentation. Raw string literals replace the original
// backslash-escaped forms for readability; the patterns are unchanged.
var matchers = map[string]*regexp.Regexp{
	"feature":    regexp.MustCompile(`^(\s*)Feature:\s*(.*)`),
	"scenario":   regexp.MustCompile(`^(\s*)Scenario:\s*(.*)`),
	"background": regexp.MustCompile(`^(\s*)Background:`),
	"step":       regexp.MustCompile(`^(\s*)(Given|When|Then|And|But)\s+(.+)`),
	"comment":    regexp.MustCompile(`^(\s*)#(.+)`),
	"pystring":   regexp.MustCompile(`^(\s*)"""`),
	"tags":       regexp.MustCompile(`^(\s*)(@.+)`),
	"table_row":  regexp.MustCompile(`^(\s*)(\|.+)`),
}

// --- gherkin/lexer/token.go ---

// Token is a single lexical unit produced by the Lexer.
type Token struct {
	Type         TokenType // kind of token
	Line, Indent int       // 0-based source line and leading-space count
	Value        string    // text payload; empty for structural tokens
}

// OfType reports whether the token's type is any of the given types.
func (t *Token) OfType(all ...TokenType) bool {
	for _, typ := range all {
		if typ == t.Type {
			return true
		}
	}
	return false
}

// --- gherkin/lexer/token_type.go ---

// TokenType identifies the kind of a Token. The zero value is ILLEGAL.
type TokenType int

const (
	ILLEGAL TokenType = iota

	// specials, elements and keywords are unexported group markers:
	// each consumes one iota slot and is never used as a token type.
	specials
	COMMENT
	NEW_LINE
	EOF

	elements
	TEXT
	TAGS
	TABLE_ROW
	PYSTRING

	keywords
	FEATURE
	BACKGROUND
	SCENARIO
	GIVEN
	WHEN
	THEN
	AND
	BUT
)
TokenType) String() string { + switch t { + case COMMENT: + return "comment" + case NEW_LINE: + return "new line" + case EOF: + return "end of file" + case TEXT: + return "text" + case TAGS: + return "tags" + case TABLE_ROW: + return "table row" + case PYSTRING: + return "pystring" + case FEATURE: + return "feature" + case BACKGROUND: + return "background" + case SCENARIO: + return "scenario" + case GIVEN: + return "given step" + case WHEN: + return "when step" + case THEN: + return "then step" + case AND: + return "and step" + case BUT: + return "but step" + } + return "illegal" +} diff --git a/gherkin/parse.go b/gherkin/parse.go new file mode 100644 index 0000000..5cb41e3 --- /dev/null +++ b/gherkin/parse.go @@ -0,0 +1,46 @@ +package gherkin + +type Tag string + +type Scenario struct { + Steps []*Step + Tags []Tag + Line string +} + +type Background struct { + Steps []*Step + Line string +} + +type StepType string + +const ( + Given StepType = "Given" + When StepType = "When" + Then StepType = "Then" +) + +type Step struct { + Line string + Text string + Type StepType +} + +type Feature struct { + Tags []Tag + Description string + Line string + Title string + Filename string + Background *Background + Scenarios []*Scenario +} + +// func Parse(r io.Reader) (*Feature, error) { +// in := bufio.NewReader(r) +// for line, err := in.ReadString(byte('\n')); err != nil; line, err = in.ReadString(byte('\n')) { +// ln := strings.TrimFunc(string(line), unicode.IsSpace) +// } +// return nil, nil +// }