从零开始写一个JSON解析器

JSON的语法简单，很适合上完编译原理课程之后或者学习完龙书语法分析之后的练手。

**JSON**

JSON格式

{"a" : 123 }
[1,2,3]
{"a": [1,2,3] }
{"a" : {"b" : 123 }}

基础为以上几种。有了基本的格式，可以考虑先写词法分析部分。JSON格式类似于key/value存储，其中key为string类型，value复杂一些，可包括，string，int，double，jsonobject，jsonarray，true，false，null标记。所以对每个token都要逐一分析，构造出DFA。

字符串
字符串以 “ 开头，以 “ 结束。其中需要考虑的是转义字符的忽略以及unicode的格式（\u hex hex hex hex）

private Token readString() throws IOException, JsonLexException {
       StringBuilder builder = new StringBuilder();
       while ( (this.at_ = read()) != '\"') {
           if (this.at_ == '\r' || this.at_ == '\n') error("string", this.at_);
           if (this.at_ == '\\') {
               //check escape char
               this.at_ = read();
               if (isEscapeChar(this.at_)) {
                   builder.append('\\');
                   builder.append((char) this.at_);
               } else if (this.at_ == 'u'){
                   //unicode
                   builder.append('\\');
                   builder.append('u');
                   builder.append(unicode());

               } else {
                   error("string", this.at_);
               }
           } else {
               builder.append((char) this.at_);
           }
       }
       if ( this.at_ != '\"') {
           //string is not closed
           error("string[not closed]", this.at_);
       }
       return this.token_ = new Token(JsonToken.STRING, new Value(builder.toString()));
   }

   private String unicode() throws IOException, JsonLexException {
       StringBuilder builder = new StringBuilder();
       int i=0;
       for (;i<4 && isHexChar(this.at_ = read()); builder.append((char) this.at_),++i);
       if (i < 4) error("unicode", this.at_);
       return builder.toString();
   }

数字
数字包含有符号int double以及科学计数法表示稍微麻烦，但是画出DFA分析就会明了很多。

DFA
据此就可很快写出来。

private Token readNumber(int at) throws IOException,JsonLexException {

        StringBuilder builder = new StringBuilder();
        int status = 0;
        while (true) {
            switch (status) {
                case 0:
                    this.at_ = at;
                    if (this.at_ == '+' || this.at_ == '-') status = 1;
                    else if (Character.isDigit(this.at_)) status = 2;
                    else error("number", this.at_);
                    builder.append((char) this.at_);
                    break;
                case 1:
                    this.at_ = read();
                    if (Character.isDigit(this.at_)) status = 2;
                    else error("number", this.at_);
                    builder.append((char) this.at_);
                    break;
                case 2:
                    this.at_ = read();
                    if (this.at_ == '.') status = 3;
                    else if (Character.isDigit(this.at_)) status = 2;
                    else if (this.at_ == 'E' || this.at_ == 'e') status = 5;
                    else status = 8;
                    if (status != 8) {
                        builder.append((char) this.at_);
                    }
                    break;
                case 3:
                    this.at_ = read();
                    if (Character.isDigit(this.at_)) status = 4;
                    else error("number", this.at_);
                    builder.append((char) this.at_);
                    break;
                case 4:
                    this.at_ = read();
                    if (Character.isDigit(this.at_)) status = 4;
                    else if (this.at_ == 'E' || this.at_ == 'e') status = 5;
                    else status = 9;
                    if (status != 9) {
                        builder.append((char) this.at_);
                    }
                    break;
                case 5:
                    this.at_ = read();
                    if (this.at_ == '+' || this.at_ == '-') status = 6;
                    else if (Character.isDigit(this.at_)) status = 7;
                    else error("number", this.at_);
                    builder.append((char) this.at_);
                    break;
                case 6:
                    this.at_ = read();
                    if (Character.isDigit(this.at_)) status = 7;
                    else error("number", this.at_);
                    builder.append((char) this.at_);
                    break;
                case 7:
                    this.at_ = read();
                    if (Character.isDigit(this.at_)) status = 7;
                    else status = 10;
                    if (status != 10) {
                        builder.append((char) this.at_);
                    }
                    break;
                case 8: // int
                    unread(this.at_); // not a digit
                    return this.token_ = new Token(JsonToken.INT, new Value(Integer.valueOf(builder.toString())));
                case 9: //double without 'E' or 'e'
                    unread(this.at_); // not a digit
                    return this.token_ = new Token(JsonToken.DOUBLE, new Value(Double.valueOf(builder.toString())));
                case 10://double with 'E' or 'e'
                    unread(this.at_);// not a digit
                    return this.token_ = new Token(JsonToken.DOUBLE,new Value(Double.valueOf(builder.toString())));
                default:
                    error("number", this.at_);
            }
        }
    }

其他字符
对于其他终结符，true false null则只要匹配首字符来判断就可以了。

综上就可写出来lexer了

public Token scan() throws IOException, JsonLexException {
        //this.at_ = '\0';
        while (isWhiteBlack( this.at_ = read() ));
        switch ((int)this.at_) {
            case '{':
                return this.token_ = new Token(JsonToken.BEGIN_OBJ, new Value("{"));
            case '}':
                return this.token_ = new Token(JsonToken.END_OBJ, new Value("}"));
            case '[':
                return this.token_ = new Token(JsonToken.BEGIN_ARR, new Value("["));
            case ']':
                return this.token_ = new Token(JsonToken.END_ARR, new Value("]"));
            case ':':
                return this.token_ = new Token(JsonToken.COLON, new Value(":"));
            case ',':
                return this.token_ = new Token(JsonToken.COMMA, new Value(","));
            case '1': case '2': case '3': case '4': case '5':
            case '6': case '7': case '8': case '9': case '0':
            case '-': case '+': case '.':
                return this.token_ = readNumber(this.at_);
            case '\"':
                return this.token_ = readString();
            case 'n':
                return this.token_ = readNull();
            case 't':
                return this.token_ = readTrue();
            case 'f':
                return this.token_ = readFalse();
            case -1:
                return this.token_ = new Token(JsonToken.EOF, new Value("eof"));
            default:
                this.token_ = null;
                error("scan->default",this.at_);
                return null;
        }
    }
``` 
词法分析就完成了。
## 语法分析
> 语法分析有LL， LR等方法这里采取LL(1)分析。

首先要构造出JSON文法。根据JSON基础格式。

obj -> { members }
members -> pair members’ | eps
members’ -> , pair members’ | eps
pair -> string : value
array -> [ elem ]
elem -> value elem’ | eps
elem’ -> , value elem’ | eps
value -> obj | array | number | string | true | false | null

求出FIRST集和FOLLOW集，验证可知为LL(1)文法。这样就可以用递归下降来做语法分析。
之后写出SDT就可以根据SDT来编写代码了。
```java

    public Json parse() throws IOException, JsonParseException, JsonLexException {
        Token token = lexer_.scan();
        if (token.getToken() == JsonToken.BEGIN_OBJ) {
            return object();
        } else if (token.getToken() == JsonToken.BEGIN_ARR) {
            return array();
        } else {
            error("parse", token.toString());
            return null;
        }
    }

    //文法 { member }
    public JsonObject object() throws IOException,
            JsonParseException, JsonLexException {
        Map<String, Value> map = new HashMap<>();
        return new JsonObject(member(map));
    }

    //对应文法 pair mem' | eps
    public Map<String ,Value> member(Map<String, Value> map) throws IOException,
            JsonLexException, JsonParseException {
        Token token = lexer_.scan();
        if (token.getToken() == JsonToken.END_OBJ) { //eps
            return map;
        }
        return member_1( pair(map) );
    }

    //对应文法 , pair mem' | eps
    public Map<String, Value> member_1(Map<String, Value> map) throws IOException, JsonLexException, JsonParseException {
        Token token = lexer_.scan();
        if (token.getToken() == JsonToken.COMMA) { //,
            lexer_.scan();
            return member_1( pair(map) );
        } else if (token.getToken() == JsonToken.END_OBJ) { //eps
            return map;
        } else {
            error("member_1", token.toString());
            return null;
        }
    }

    //文法 string : value
    private Map<String, Value> pair(Map<String, Value> map) throws IOException, JsonLexException, JsonParseException {
        Token token = lexer_.peek();
        if (token.getToken() == JsonToken.STRING) {
            String key = (String) token.getValue().get();
            token = lexer_.scan();
            if (token.getToken() == JsonToken.COLON) {
                lexer_.scan();
                map.put(key, value());
                return map;
            } else {
                error("pair", token.toString());
                return null;
            }
        } else {
            error("pair", token.toString());
            return null;
        }
    }

    //文法 [ elem ]
    public JsonArray array() throws IOException,JsonParseException, JsonLexException  {
        List<Value> list = new LinkedList<>();
        return new JsonArray(elem(list));
    }
    //文法 value elem' | eps
    public List<Value> elem(List<Value> list) throws IOException,
            JsonLexException,JsonParseException  {
        Token token = lexer_.scan();
        if (token.getToken() == JsonToken.END_ARR) { // eps
            return list;
        } else {
            list.add(value());
            return elem_1(list);
        }
    }

    // 文法 , value elem' | eps
    public List<Value> elem_1(List<Value> list) throws IOException,
            JsonLexException, JsonParseException  {
        Token token = lexer_.scan();
        if (token.getToken() == JsonToken.COMMA) { // ,
            lexer_.scan();
            list.add(value());
            return elem_1(list);
        } else if (token.getToken() == JsonToken.END_ARR) { // eps
            return list;
        } else {
            error("elem_1",token.toString());
            return null;
        }
    }

    public Value value() throws IOException, JsonLexException,
            JsonParseException {
        Token token = lexer_.peek();
        if (isCommonValue(token.getToken())) {
            return new Value(token.getValue());
        } else if (token.getToken() == JsonToken.BEGIN_OBJ) {
            return new Value(object());
        } else if (token.getToken() == JsonToken.BEGIN_ARR) {
            return new Value(array());
        } else {
            error("value",token.toString());
            return null;
        }
    }

    private boolean isCommonValue(JsonToken token) {
        return (token == JsonToken.STRING || token == JsonToken.FALSE ||
                token == JsonToken.INT || token == JsonToken.DOUBLE ||
                token == JsonToken.TRUE || token == JsonToken.NULL) ? true : false;
    }


    public void error(String position, String value) throws JsonParseException {
        throw new JsonParseException(String.format("an error occurred on Parser [%s %s]",position,value));
    }

至此一个简单的解析器已经写完了，只是实现了简单的解析功能，一个完善的JAVA JSON解析器至少可以对对象序列化和反序列化。后续将会添加上这部分功能，再更新一遍 :-|

源代码：ToyJSON