从零开始写一个JSON解析器

JSON的语法简单,很适合上完编译原理课程之后或者学习完龙书语法分析之后的练手。

**JSON**

JSON格式

1
2
3
4
{"a" : 123 }
[1,2,3]
{"a": [1,2,3] }
{"a" : {"b" : 123 }}

基础为以上几种。有了基本的格式,可以考虑先写词法分析部分。JSON格式类似于key/value存储,其中key为string类型,value复杂一些,可包括,string,int,double,jsonobject,jsonarray,true,false,null标记。所以对每个token都要逐一分析,构造出DFA。

  • 字符串
    字符串以 “ 开头,以 “ 结束。其中需要考虑的是转义字符的忽略以及unicode的格式(\u hex hex hex hex)

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    private Token readString() throws IOException, JsonLexException {
    StringBuilder builder = new StringBuilder();
    while ( (this.at_ = read()) != '\"') {
    if (this.at_ == '\r' || this.at_ == '\n') error("string", this.at_);
    if (this.at_ == '\\') {
    //check escape char
    this.at_ = read();
    if (isEscapeChar(this.at_)) {
    builder.append('\\');
    builder.append((char) this.at_);
    } else if (this.at_ == 'u'){
    //unicode
    builder.append('\\');
    builder.append('u');
    builder.append(unicode());

    } else {
    error("string", this.at_);
    }
    } else {
    builder.append((char) this.at_);
    }
    }
    if ( this.at_ != '\"') {
    //string is not closed
    error("string[not closed]", this.at_);
    }
    return this.token_ = new Token(JsonToken.STRING, new Value(builder.toString()));
    }

    private String unicode() throws IOException, JsonLexException {
    StringBuilder builder = new StringBuilder();
    int i=0;
    for (;i<4 && isHexChar(this.at_ = read()); builder.append((char) this.at_),++i);
    if (i < 4) error("unicode", this.at_);
    return builder.toString();
    }
  • 数字
    数字包含有符号int double以及科学计数法表示稍微麻烦,但是画出DFA分析就会明了很多。

DFA
据此就可很快写出来。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
private Token readNumber(int at) throws IOException,JsonLexException {

StringBuilder builder = new StringBuilder();
int status = 0;
while (true) {
switch (status) {
case 0:
this.at_ = at;
if (this.at_ == '+' || this.at_ == '-') status = 1;
else if (Character.isDigit(this.at_)) status = 2;
else error("number", this.at_);
builder.append((char) this.at_);
break;
case 1:
this.at_ = read();
if (Character.isDigit(this.at_)) status = 2;
else error("number", this.at_);
builder.append((char) this.at_);
break;
case 2:
this.at_ = read();
if (this.at_ == '.') status = 3;
else if (Character.isDigit(this.at_)) status = 2;
else if (this.at_ == 'E' || this.at_ == 'e') status = 5;
else status = 8;
if (status != 8) {
builder.append((char) this.at_);
}
break;
case 3:
this.at_ = read();
if (Character.isDigit(this.at_)) status = 4;
else error("number", this.at_);
builder.append((char) this.at_);
break;
case 4:
this.at_ = read();
if (Character.isDigit(this.at_)) status = 4;
else if (this.at_ == 'E' || this.at_ == 'e') status = 5;
else status = 9;
if (status != 9) {
builder.append((char) this.at_);
}
break;
case 5:
this.at_ = read();
if (this.at_ == '+' || this.at_ == '-') status = 6;
else if (Character.isDigit(this.at_)) status = 7;
else error("number", this.at_);
builder.append((char) this.at_);
break;
case 6:
this.at_ = read();
if (Character.isDigit(this.at_)) status = 7;
else error("number", this.at_);
builder.append((char) this.at_);
break;
case 7:
this.at_ = read();
if (Character.isDigit(this.at_)) status = 7;
else status = 10;
if (status != 10) {
builder.append((char) this.at_);
}
break;
case 8: // int
unread(this.at_); // not a digit
return this.token_ = new Token(JsonToken.INT, new Value(Integer.valueOf(builder.toString())));
case 9: //double without 'E' or 'e'
unread(this.at_); // not a digit
return this.token_ = new Token(JsonToken.DOUBLE, new Value(Double.valueOf(builder.toString())));
case 10://double with 'E' or 'e'
unread(this.at_);// not a digit
return this.token_ = new Token(JsonToken.DOUBLE,new Value(Double.valueOf(builder.toString())));
default:
error("number", this.at_);
}
}
}
  • 其他字符
    对于其他终结符,true false null则只要匹配首字符来判断就可以了。

综上就可写出来lexer了

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
public Token scan() throws IOException, JsonLexException {
//this.at_ = '\0';
while (isWhiteBlack( this.at_ = read() ));
switch ((int)this.at_) {
case '{':
return this.token_ = new Token(JsonToken.BEGIN_OBJ, new Value("{"));
case '}':
return this.token_ = new Token(JsonToken.END_OBJ, new Value("}"));
case '[':
return this.token_ = new Token(JsonToken.BEGIN_ARR, new Value("["));
case ']':
return this.token_ = new Token(JsonToken.END_ARR, new Value("]"));
case ':':
return this.token_ = new Token(JsonToken.COLON, new Value(":"));
case ',':
return this.token_ = new Token(JsonToken.COMMA, new Value(","));
case '1': case '2': case '3': case '4': case '5':
case '6': case '7': case '8': case '9': case '0':
case '-': case '+': case '.':
return this.token_ = readNumber(this.at_);
case '\"':
return this.token_ = readString();
case 'n':
return this.token_ = readNull();
case 't':
return this.token_ = readTrue();
case 'f':
return this.token_ = readFalse();
case -1:
return this.token_ = new Token(JsonToken.EOF, new Value("eof"));
default:
this.token_ = null;
error("scan->default",this.at_);
return null;
}
}
```
词法分析就完成了。
## 语法分析
> 语法分析有LL, LR等方法这里采取LL(1)分析。

首先要构造出JSON文法。根据JSON基础格式。

obj -> { members }
members -> pair members’ | eps
members’ -> , pair members’ | eps
pair -> string : value
array -> [ elem ]
elem -> value elem’ | eps
elem’ -> , value elem’ | eps
value -> obj | array | number | string | true | false | null

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
求出FIRST集和FOLLOW集,验证可知为LL(1)文法。这样就可以用递归下降来做语法分析。
之后写出SDT就可以根据SDT来编写代码了。
```java

public Json parse() throws IOException, JsonParseException, JsonLexException {
Token token = lexer_.scan();
if (token.getToken() == JsonToken.BEGIN_OBJ) {
return object();
} else if (token.getToken() == JsonToken.BEGIN_ARR) {
return array();
} else {
error("parse", token.toString());
return null;
}
}

//文法 { member }
public JsonObject object() throws IOException,
JsonParseException, JsonLexException {
Map<String, Value> map = new HashMap<>();
return new JsonObject(member(map));
}

//对应文法 pair mem' | eps
public Map<String ,Value> member(Map<String, Value> map) throws IOException,
JsonLexException, JsonParseException {
Token token = lexer_.scan();
if (token.getToken() == JsonToken.END_OBJ) { //eps
return map;
}
return member_1( pair(map) );
}

//对应文法 , pair mem' | eps
public Map<String, Value> member_1(Map<String, Value> map) throws IOException, JsonLexException, JsonParseException {
Token token = lexer_.scan();
if (token.getToken() == JsonToken.COMMA) { //,
lexer_.scan();
return member_1( pair(map) );
} else if (token.getToken() == JsonToken.END_OBJ) { //eps
return map;
} else {
error("member_1", token.toString());
return null;
}
}

//文法 string : value
private Map<String, Value> pair(Map<String, Value> map) throws IOException, JsonLexException, JsonParseException {
Token token = lexer_.peek();
if (token.getToken() == JsonToken.STRING) {
String key = (String) token.getValue().get();
token = lexer_.scan();
if (token.getToken() == JsonToken.COLON) {
lexer_.scan();
map.put(key, value());
return map;
} else {
error("pair", token.toString());
return null;
}
} else {
error("pair", token.toString());
return null;
}
}

//文法 [ elem ]
public JsonArray array() throws IOException,JsonParseException, JsonLexException {
List<Value> list = new LinkedList<>();
return new JsonArray(elem(list));
}
//文法 value elem' | eps
public List<Value> elem(List<Value> list) throws IOException,
JsonLexException,JsonParseException {
Token token = lexer_.scan();
if (token.getToken() == JsonToken.END_ARR) { // eps
return list;
} else {
list.add(value());
return elem_1(list);
}
}

// 文法 , value elem' | eps
public List<Value> elem_1(List<Value> list) throws IOException,
JsonLexException, JsonParseException {
Token token = lexer_.scan();
if (token.getToken() == JsonToken.COMMA) { // ,
lexer_.scan();
list.add(value());
return elem_1(list);
} else if (token.getToken() == JsonToken.END_ARR) { // eps
return list;
} else {
error("elem_1",token.toString());
return null;
}
}

public Value value() throws IOException, JsonLexException,
JsonParseException {
Token token = lexer_.peek();
if (isCommonValue(token.getToken())) {
return new Value(token.getValue());
} else if (token.getToken() == JsonToken.BEGIN_OBJ) {
return new Value(object());
} else if (token.getToken() == JsonToken.BEGIN_ARR) {
return new Value(array());
} else {
error("value",token.toString());
return null;
}
}

private boolean isCommonValue(JsonToken token) {
return (token == JsonToken.STRING || token == JsonToken.FALSE ||
token == JsonToken.INT || token == JsonToken.DOUBLE ||
token == JsonToken.TRUE || token == JsonToken.NULL) ? true : false;
}


public void error(String position, String value) throws JsonParseException {
throw new JsonParseException(String.format("an error occurred on Parser [%s %s]",position,value));
}

至此一个简单的解析器已经写完了,只是实现了简单的解析功能,一个完善的JAVA JSON解析器至少可以对对象序列化和反序列化。后续将会添加上这部分功能,再更新一遍 :-|

源代码:ToyJSON