json_ada_d429d7af/json/src/json-tokenizers.adb

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
--  SPDX-License-Identifier: Apache-2.0
--
--  Copyright (c) 2016 onox <denkpadje@gmail.com>
--
--  Licensed under the Apache License, Version 2.0 (the "License");
--  you may not use this file except in compliance with the License.
--  You may obtain a copy of the License at
--
--      http://www.apache.org/licenses/LICENSE-2.0
--
--  Unless required by applicable law or agreed to in writing, software
--  distributed under the License is distributed on an "AS IS" BASIS,
--  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
--  See the License for the specific language governing permissions and
--  limitations under the License.

with Ada.Characters.Latin_1;
with Ada.IO_Exceptions;
with Ada.Strings.Bounded;

package body JSON.Tokenizers is

   procedure Read_String
     (Stream     : in out Streams.Stream;
      Next_Token : out Token)
   is
      C : Character;
      Index, Length : Streams.AS.Stream_Element_Offset := 0;
      Escaped : Boolean := False;

      use type Streams.AS.Stream_Element_Offset;
      use Ada.Characters.Latin_1;
   begin
      loop
         C := Stream.Read_Character (Index);

         --  An unescaped '"' character denotes the end of the string
         exit when not Escaped and C = '"';

         Length := Length + 1;

         if Escaped then
            case C is
               when '"' | '\' | '/' | 'b' | 'f' | 'n' | 'r' | 't' =>
                  null;
               when 'u' =>
                  --  TODO Support escaped unicode
                  raise Program_Error with "Escaped unicode not supported yet";
               when others =>
                  raise Tokenizer_Error with "Unexpected escaped character in string";
            end case;
         elsif C /= '\' then
            --  Check C is not a control character
            if C in NUL .. US then
               raise Tokenizer_Error with "Unexpected control character in string";
            end if;
         end if;
         Escaped := not Escaped and C = '\';
      end loop;
      Next_Token := Token'
        (Kind => String_Token, String_Offset => Index - Length, String_Length => Length);
   end Read_String;

   procedure Test_Leading_Zeroes (First : Character; Value : String) is
      Leading_Zero_Message : constant String := "Leading zeroes in number are not allowed";
      Minus_Digit_Message  : constant String := "Expected at least one digit after - sign";
   begin
      if First = '-' then
         if Value'Length >= 3 and then Value (Value'First .. Value'First + 1) = "-0" then
            raise Tokenizer_Error with Leading_Zero_Message;
         elsif Value'Length = 1 then
            raise Tokenizer_Error with Minus_Digit_Message;
         end if;
      elsif First = '0' and Value'Length >= 2 then
         raise Tokenizer_Error with Leading_Zero_Message;
      end if;
   end Test_Leading_Zeroes;

   procedure Read_Number
     (Stream     : in out Streams.Stream;
      First      : Character;
      Next_Token : out Token)
   is
      package SB is new Ada.Strings.Bounded.Generic_Bounded_Length
        (Max => Types.Maximum_String_Length_Numbers);

      Value : SB.Bounded_String;
      C     : Character;
      Is_Float, Checked_Leading_Zeroes : Boolean := False;

      Error_Dot_Message : constant String
        := "Number must contain at least one digit after decimal point";
      Error_Exp_Message : constant String
        := "Expected optional +/- sign after e/E and then at least one digit";
      Error_Plus_Message : constant String
        := "Prefixing number with '+' character is not allowed";
      Error_One_Digit_Message : constant String
        := "Expected at least one digit after +/- sign in number";
      Error_Length_Message : constant String
        := "Number is longer than" & Types.Maximum_String_Length_Numbers'Image & " characters";

      procedure Create_Token_From_Number is
         Number : constant String := SB.To_String (Value);
      begin
         if Is_Float then
            Next_Token := Token'(Kind => Float_Token,
              Float_Value => Types.Float_Type'Value (Number));
         else
            Next_Token := Token'(Kind => Integer_Token,
              Integer_Value => Types.Integer_Type'Value (Number));
         end if;
      end Create_Token_From_Number;
   begin
      if First = '+' then
         raise Tokenizer_Error with Error_Plus_Message;
      end if;
      SB.Append (Value, First);

      --  Accept sequence of digits, including leading zeroes
      loop
         C := Stream.Read_Character;
         exit when C not in '0' .. '9';
         SB.Append (Value, C);
      end loop;

      --  Test whether value contains leading zeroes
      Test_Leading_Zeroes (First, SB.To_String (Value));
      Checked_Leading_Zeroes := True;

      --  Tokenize fraction part
      if C = '.' then
         Is_Float := True;

         --  Append the dot
         SB.Append (Value, C);

         --  Require at least one digit after decimal point
         begin
            C := Stream.Read_Character;
            if C not in '0' .. '9' then
               raise Tokenizer_Error with Error_Dot_Message;
            end if;
            SB.Append (Value, C);
         exception
            when Ada.IO_Exceptions.End_Error =>
               raise Tokenizer_Error with Error_Dot_Message;
         end;

         --  Accept sequence of digits
         loop
            C := Stream.Read_Character;
            exit when C not in '0' .. '9';
            SB.Append (Value, C);
         end loop;
      end if;

      --  Tokenize exponent part
      if C in 'e' | 'E' then
         --  Append the 'e' or 'E' character
         SB.Append (Value, C);

         begin
            C := Stream.Read_Character;
            --  Append optional '+' or '-' character
            if C in '+' | '-' then
               --  If exponent is negative, number will be a float
               if C = '-' then
                  Is_Float := True;
               end if;

               SB.Append (Value, C);

               --  Require at least one digit after +/- sign
               C := Stream.Read_Character;
               if C not in '0' .. '9' then
                  raise Tokenizer_Error with Error_One_Digit_Message;
               end if;
               SB.Append (Value, C);
            elsif C in '0' .. '9' then
               SB.Append (Value, C);
            else
               raise Tokenizer_Error with Error_Exp_Message;
            end if;
         exception
            when Ada.IO_Exceptions.End_Error =>
               raise Tokenizer_Error with Error_Exp_Message;
         end;

         --  Accept sequence of digits
         loop
            C := Stream.Read_Character;
            exit when C not in '0' .. '9';
            SB.Append (Value, C);
         end loop;
      end if;

      Create_Token_From_Number;
      Stream.Write_Character (C);
   exception
      --  End_Error is raised if the number if followed by an EOF
      when Ada.IO_Exceptions.End_Error =>
         --  Test whether value contains leading zeroes
         if not Checked_Leading_Zeroes then
            Test_Leading_Zeroes (First, SB.To_String (Value));
         end if;

         Create_Token_From_Number;
      when Ada.Strings.Length_Error =>
         raise Tokenizer_Error with Error_Length_Message;
   end Read_Number;

   procedure Read_Literal
     (Stream     : in out Streams.Stream;
      First      : Character;
      Next_Token : out Token)
   is
      package SB is new Ada.Strings.Bounded.Generic_Bounded_Length (Max => 5);

      Value : SB.Bounded_String;
      C     : Character;

      Unexpected_Literal_Message : constant String
        := "Expected literal 'true', 'false', or 'null'";

      procedure Create_Token_From_Literal is
         Literal : constant String := SB.To_String (Value);
      begin
         if Literal = "true" then
            Next_Token := Token'(Kind => Boolean_Token, Boolean_Value => True);
         elsif Literal = "false" then
            Next_Token := Token'(Kind => Boolean_Token, Boolean_Value => False);
         elsif Literal = "null" then
            Next_Token := Token'(Kind => Null_Token);
         else
            raise Tokenizer_Error with Unexpected_Literal_Message;
         end if;
      end Create_Token_From_Literal;
   begin
      SB.Append (Value, First);
      loop
         C := Stream.Read_Character;
         exit when C not in 'a' .. 'z' or else SB.Length (Value) = SB.Max_Length;
         SB.Append (Value, C);
      end loop;

      Create_Token_From_Literal;
      Stream.Write_Character (C);
   exception
      --  End_Error is raised if the literal if followed by an EOF
      when Ada.IO_Exceptions.End_Error =>
         Create_Token_From_Literal;
      when Ada.Strings.Length_Error =>
         raise Tokenizer_Error with Unexpected_Literal_Message;
   end Read_Literal;

   procedure Read_Token
     (Stream     : in out Streams.Stream;
      Next_Token : out Token;
      Expect_EOF : Boolean := False)
   is
      C : Character;

      use Ada.Characters.Latin_1;
   begin
      loop
         --  Read the first next character and decide which token it could be part of
         C := Stream.Read_Character;

         --  Skip whitespace
         exit when C not in Space | HT | LF | CR;
      end loop;

      if Expect_EOF then
         raise Tokenizer_Error with "Expected to read EOF";
      end if;

      case C is
         when '[' =>
            Next_Token := Token'(Kind => Begin_Array_Token);
         when '{' =>
            Next_Token := Token'(Kind => Begin_Object_Token);
         when ']' =>
            Next_Token := Token'(Kind => End_Array_Token);
         when '}' =>
            Next_Token := Token'(Kind => End_Object_Token);
         when ':' =>
            Next_Token := Token'(Kind => Name_Separator_Token);
         when ',' =>
            Next_Token := Token'(Kind => Value_Separator_Token);
         when '"' =>
            Read_String (Stream, Next_Token);
         when '0' .. '9' | '+' | '-' =>
            Read_Number (Stream, C, Next_Token);
         when 'a' .. 'z' =>
            Read_Literal (Stream, C, Next_Token);
         when others =>
            raise Tokenizer_Error with "Unexpected character";
      end case;
   exception
      when Ada.IO_Exceptions.End_Error =>
         if Expect_EOF then
            Next_Token := Token'(Kind => EOF_Token);
         else
            raise Tokenizer_Error with "Unexpectedly read EOF";
         end if;
   end Read_Token;

end JSON.Tokenizers;