wisitoken_4.2.1_dc778486/wisitoken-lexer-regexp.adb

--  Abstract:
--
--  See spec
--
--  Copyright (C) 2015, 2017 - 2022 Free Software Foundation, Inc.
--
--  This file is part of the WisiToken package.
--
--  The WisiToken package is free software; you can redistribute it
--  and/or modify it under the terms of the GNU General Public License
--  as published by the Free Software Foundation; either version 3, or
--  (at your option) any later version. The WisiToken package is
--  distributed in the hope that it will be useful, but WITHOUT ANY
--  WARRANTY; without even the implied warranty of MERCHANTABILITY or
--  FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
--  License for more details. You should have received a copy of the
--  GNU General Public License distributed with the WisiToken package;
--  see file GPL.txt. If not, write to the Free Software Foundation,
--  59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
--
--  As a special exception, if other files instantiate generics from
--  this unit, or you link this unit with other files to produce an
--  executable, this unit does not by itself cause the resulting
--  executable to be covered by the GNU General Public License. This
--  exception does not however invalidate any other reasons why the
--  executable file might be covered by the GNU Public License.

pragma License (Modified_GPL);

with Ada.Characters.Latin_1;
with SAL;
package body WisiToken.Lexer.Regexp is

   function Find_Best_Match (Lexer : in out Instance) return Boolean
   is
      --  Find the longest matching character sequence in the buffer that
      --  matches a token.
      --
      --  Return True if a token is matched, False if not.

      use WisiToken.Regexp;

      Current_Char         : Integer := Lexer.Buffer_Head;
      Current_State        : Match_State;
      Current_Match_Length : Integer := 0;
      Best_Match_ID        : Token_ID;
      Best_Match_Length    : Natural := 0;
      Still_Matching       : Boolean := False;
   begin
      --  We only support Reset_With_String.

      if Current_Char > Lexer.Source.Buffer'Last then
         Lexer.ID := Lexer.Descriptor.EOI_ID;
         Lexer.Lexeme_Head := Lexer.Buffer_Head;
         Lexer.Lexeme_Tail := Lexer.Buffer_Head - 1;
         return True;
      end if;

      for I in Lexer.Syntax'Range loop
         Clear (Lexer.Syntax (I).Regexp);
      end loop;

      loop
         Still_Matching := False;

         for I in Lexer.Syntax'Range loop
            if State (Lexer.Syntax (I).Regexp) /= WisiToken.Regexp.Error then
               Current_State := Match
                 (Lexer.Syntax (I).Regexp,
                  Lexer.Source.Buffer (Lexer.Buffer_Head .. Lexer.Source.Buffer'Last),
                  Current_Char);

               case Current_State is
               when Matching =>
                  Still_Matching := True;

               when Final =>
                  Still_Matching := True;

                  Current_Match_Length := Current_Char - Lexer.Buffer_Head + 1;

                  if Best_Match_Length < Current_Match_Length then
                     Best_Match_ID  := I;
                     Best_Match_Length := Current_Match_Length;
                  end if;

               when WisiToken.Regexp.Error =>
                  null;
               end case;
            end if;
         end loop;

         exit when (not Still_Matching) or else (Current_Char = Lexer.Source.Buffer'Last);

         if Best_Match_Length = Lexer.Source.Buffer'Length then
            raise SAL.Programmer_Error with
              "token larger than buffer size of" & Integer'Image (Lexer.Source.Buffer'Length);
         end if;

         Current_Char := Current_Char + 1;
      end loop;

      if Best_Match_Length > 0 then
         Lexer.Lexeme_Head := Lexer.Buffer_Head;
         Lexer.Lexeme_Tail := Lexer.Buffer_Head + Best_Match_Length - 1;
         Lexer.ID          := Best_Match_ID;

         if Lexer.Lexeme_Head = Lexer.Source.Buffer'Last and
           Lexer.Source.Buffer (Lexer.Lexeme_Head) = Ada.Characters.Latin_1.EOT
         then
            --  matched EOF; repeat that next time
            null;
         else
            Lexer.Buffer_Head := Lexer.Lexeme_Tail + 1;
         end if;
         return True;

      elsif Current_Char = Lexer.Source.Buffer'Last then
         Lexer.ID := Lexer.Descriptor.EOI_ID;
         Lexer.Buffer_Head := Lexer.Buffer_Head + 1;
         return True;

      else
         return False;
      end if;

   end Find_Best_Match;

   ----------
   --  Public subprograms

   function Get
     (Regexp         : in String;
      Case_Sensitive : in Boolean := True;
      Report         : in Boolean := True)
      return Syntax_Item
   is begin
      return (WisiToken.Regexp.Compile (Regexp, Case_Sensitive), Report);
   end Get;

   type Instance_Access is access Instance; --  silence compiler warning

   function New_Lexer
     (Trace      : in WisiToken.Trace_Access;
      Descriptor : in WisiToken.Descriptor_Access_Constant;
      Syntax     : in WisiToken.Lexer.Regexp.Syntax)
     return WisiToken.Lexer.Handle
   is
      New_Lexer : constant Instance_Access := new Instance (Syntax'Last);
   begin
      New_Lexer.Trace      := Trace;
      New_Lexer.Descriptor := Descriptor;
      New_Lexer.Syntax     := Syntax;

      return Handle (New_Lexer);
   end New_Lexer;

   overriding procedure Finalize (Object : in out Instance)
   is begin
      Finalize (Object.Source);
   end Finalize;

   overriding procedure Reset_With_String
     (Lexer      : in out Instance;
      Input      : in     String;
      Begin_Char : in     Buffer_Pos       := Buffer_Pos'First;
      Begin_Line : in     Line_Number_Type := Line_Number_Type'First)
   is begin
      Finalize (Lexer);

      Lexer.Source :=
        (Label                     => String_Label,
         File_Name                 => +"",
         Buffer_Nominal_First_Byte => Base_Buffer_Pos (Input'First),
         Buffer_Nominal_First_Char => Begin_Char,
         Line_Nominal_First        => Begin_Line,
         Buffer                    => new String'(Input),
         Buffer_Last               => Input'Last,
         User_Buffer               => False);

      Reset (Lexer);
   end Reset_With_String;

   overriding procedure Reset_With_String_Access
     (Lexer      : in out Instance;
      Input      : in     Ada.Strings.Unbounded.String_Access;
      Input_Last : in     Integer;
      File_Name  : in     Ada.Strings.Unbounded.Unbounded_String;
      Begin_Char : in     Buffer_Pos       := Buffer_Pos'First;
      Begin_Line : in     Line_Number_Type := Line_Number_Type'First)
   is begin
      Finalize (Lexer);

      Lexer.Source :=
        (Label       => String_Label,
         File_Name   => File_Name,
         Buffer_Nominal_First_Byte => Base_Buffer_Pos (Input'First),
         Buffer_Nominal_First_Char => Begin_Char,
         Line_Nominal_First        => Begin_Line,
         Buffer      => Input,
         Buffer_Last => Input_Last,
         User_Buffer => True);

      Reset (Lexer);
   end Reset_With_String_Access;

   overriding procedure Reset_With_File
     (Lexer          : in out Instance;
      File_Name      : in     String;
      Begin_Byte_Pos : in     Buffer_Pos       := Invalid_Buffer_Pos;
      End_Byte_Pos   : in     Buffer_Pos       := Invalid_Buffer_Pos;
      Begin_Char     : in     Buffer_Pos       := Buffer_Pos'First;
      Begin_Line     : in     Line_Number_Type := Line_Number_Type'First)
   is
      pragma Unreferenced (File_Name, Begin_Byte_Pos, End_Byte_Pos, Begin_Char, Begin_Line);
   begin
      Finalize (Lexer);

      raise SAL.Not_Implemented;
   end Reset_With_File;

   overriding procedure Reset
     (Lexer : in out Instance)
   is begin
      Lexer.Lexeme_Head := Lexer.Source.Buffer'First;
      Lexer.Lexeme_Tail := Lexer.Source.Buffer'First - 1;
      Lexer.ID          := Invalid_Token_ID;
      Lexer.Buffer_Head := Lexer.Source.Buffer'First;
   end Reset;

   overriding function Find_Next
     (Lexer : in out Instance;
      Token :    out WisiToken.Lexer.Token)
     return Natural
   is begin
      loop
         if not Find_Best_Match (Lexer) then
            if Lexer.Buffer_Head > Lexer.Source.Buffer'Last then
               raise Syntax_Error with "Unrecognized EOF";
            else
               raise Syntax_Error with "Unrecognized character '" & Lexer.Source.Buffer (Lexer.Buffer_Head) & "'";
            end if;
         end if;

         exit when Lexer.Syntax (Lexer.ID).Report;

      end loop;

      Token :=
        (ID          => Lexer.ID,
         Byte_Region => (Buffer_Pos (Lexer.Lexeme_Head), Buffer_Pos (Lexer.Lexeme_Tail)),
         Char_Region => (Buffer_Pos (Lexer.Lexeme_Head), Buffer_Pos (Lexer.Lexeme_Tail)),
         Line_Region => Null_Line_Region);

      return 0;
   end Find_Next;

   overriding
   function Is_Block_Delimited
     (Lexer : in Instance;
      ID    : in Token_ID)
     return Boolean
   is
      pragma Unreferenced (Lexer, ID);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return False;
   end Is_Block_Delimited;

   overriding
   function Same_Block_Delimiters
     (Lexer : in Instance;
      ID    : in Token_ID)
     return Boolean
   is
      pragma Unreferenced (Lexer, ID);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return True;
   end Same_Block_Delimiters;

   overriding
   function Escape_Delimiter_Doubled
     (Lexer : in Instance;
      ID    : in Token_ID)
     return Boolean
   is
      pragma Unreferenced (Lexer, ID);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return True;
   end Escape_Delimiter_Doubled;

   overriding
   function Start_Delimiter_Length
     (Lexer : in Instance;
      ID    : in Token_ID)
     return Integer
   is
      pragma Unreferenced (Lexer, ID);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return Integer'Last;
   end Start_Delimiter_Length;

   overriding
   function End_Delimiter_Length
     (Lexer : in Instance;
      ID    : in Token_ID)
     return Integer
   is
      pragma Unreferenced (Lexer, ID);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return Integer'Last;
   end End_Delimiter_Length;

   overriding
   function New_Line_Is_End_Delimiter
     (Lexer : in Instance;
      ID    : in Token_ID)
     return Boolean
   is
      pragma Unreferenced (Lexer, ID);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return True;
   end New_Line_Is_End_Delimiter;

   overriding
   function Find_End_Delimiter
     (Lexer         : in Instance;
      ID            : in Token_ID;
      Comment_Start : in Buffer_Pos)
     return Buffer_Pos
   is
      pragma Unreferenced (Lexer, ID, Comment_Start);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return Buffer_Pos'First;
   end Find_End_Delimiter;

   overriding
   function Contains_End_Delimiter
     (Lexer  : in Instance;
      ID     : in Token_ID;
      Region : in Buffer_Region)
     return Base_Buffer_Pos
   is begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return Buffer_Pos'First;
   end Contains_End_Delimiter;

   overriding
   function Find_Scan_End
     (Lexer       : in Instance;
      ID          : in Token_ID;
      Byte_Region : in Buffer_Region;
      Inserted    : in Boolean;
      Start       : in Boolean)
     return Buffer_Pos
   is
      pragma Unreferenced (Lexer, ID, Inserted, Start);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return Buffer_Pos'First;
   end Find_Scan_End;

   overriding
   function Line_Begin_Char_Pos
     (Lexer : in Instance;
      Token : in WisiToken.Lexer.Token;
      Line  : in Line_Number_Type)
     return Base_Buffer_Pos
   is
      pragma Unreferenced (Lexer, Token, Line);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return Invalid_Buffer_Pos;
   end Line_Begin_Char_Pos;

   overriding
   function Line_At_Byte_Pos
     (Lexer       : in Instance;
      Byte_Region : in WisiToken.Buffer_Region;
      Byte_Pos    : in Buffer_Pos;
      First_Line  : in Line_Number_Type)
     return Line_Number_Type
   is begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return Line_Number_Type'Last;
   end Line_At_Byte_Pos;

   overriding
   function Can_Contain_New_Line
     (Lexer : in Instance;
      ID    : in Token_ID)
     return Boolean
   is
      pragma Unreferenced (Lexer, ID);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return False;
   end Can_Contain_New_Line;

   overriding
   function Terminated_By_New_Line
     (Lexer : in Instance;
      ID    : in Token_ID)
     return Boolean
   is
      pragma Unreferenced (Lexer, ID);
   begin
      --  regexp lexer only used in unit tests
      raise SAL.Not_Implemented;
      return False;
   end Terminated_By_New_Line;

end WisiToken.Lexer.Regexp;