------------------------------------------------------------------------------
-- G N A T C O L L --
-- --
-- Copyright (C) 2001-2017, AdaCore --
-- --
-- This library is free software; you can redistribute it and/or modify it --
-- under terms of the GNU General Public License as published by the Free --
-- Software Foundation; either version 3, or (at your option) any later --
-- version. This library is distributed in the hope that it will be useful, --
-- but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHAN- --
-- TABILITY or FITNESS FOR A PARTICULAR PURPOSE. --
-- --
-- As a special exception under Section 7 of GPL version 3, you are granted --
-- additional permissions described in the GCC Runtime Library Exception, --
-- version 3.1, as published by the Free Software Foundation. --
-- --
-- You should have received a copy of the GNU General Public License and --
-- a copy of the GCC Runtime Library Exception along with this program; --
-- see the files COPYING3 and COPYING.RUNTIME respectively. If not, see --
-- . --
-- --
------------------------------------------------------------------------------
-- The search is done right-to-left. In the best cases (the text doesn't
-- contain any character from the pattern), this results in
-- string_length / pattern_length, characters being examined, instead of
-- string_length characters.
--
-- Compiling the pattern generates two lookup-tables:
--
-- The Last Occurrence Function
-- ============================
--
-- The Last-Ocurrence-Function returns the right-most location for each
-- character in the alphabet in the pattern.
-- When a character is seen in the searched string, this array will suggest
-- the offset by which we should move the character:
-- string: "revolution in the treatment of"
-- pattern: " reminiscence"
-- ^ when we see the 'h', we can move to:
-- " reminiscence"
--
-- string: "written notice that"
-- pattern: " reminiscence"
-- ^ when the see the 'i', we can move to:
-- " reminiscence"
--
-- string: "golden fleece of"
-- pattern: " reminiscence"
-- ^ when we see the 'e', no move can be suggested,
-- since 'e' appears at the right-most position
-- in the pattern.
--
-- The Good Suffix Function
-- ========================
--
-- This function reports the least amount that garantees that any pattern
-- characters that align with the good suffix previously found in the text
-- will match those suffix characters.
-- For instance:
--
-- string: "written notice that"
-- pattern: " reminiscence"
-- ^ The pattern would be moved so that the "ce"
-- vv we have already found match some text.
-- " reminiscence"
--
-- Combination
-- ===========
--
-- The two functions above can be computed statically based only on the
-- pattern, and without any knowledge of the text.
-- When we try to match a pattern with a text, these two functions are
-- combined, and the pattern is moved forward by the maximum amount reported
-- by the two functions.
with Unchecked_Deallocation;
with Ada.Text_IO; use Ada.Text_IO;
with Ada.Integer_Text_IO; use Ada.Integer_Text_IO;
with GNAT.Case_Util; use GNAT.Case_Util;
package body GNATCOLL.Boyer_Moore is
Debug : constant Boolean := False;
Debug_Run : constant Boolean := False;
procedure Dump_Str (Str : String);
-- Print string, replacing the newlines with spaces for clarity
procedure Dump
(M, Shift, J : Natural;
Num_Comp : in out Natural;
Motif : Pattern;
In_String : String);
-- Print the current state of the search.
-- The parameters are the internal state in Search. We do not use a
-- nested subprogram for efficiency reasons
-------------
-- Compile --
-------------
procedure Compile
(Motif : in out Pattern;
From_String : String;
Case_Sensitive : Boolean := True)
is
-- Prefix contains the following:
-- Prefix (J) is the length of the longest prefix of Motif
-- which is also a suffix of
-- Motif (Motif'First .. Motif'First + J - 1)
-- ie of the motif made of the j-th first characters of Motif
--
-- Reverse_Prefix is the Prefix function applied to the reverse of Motif
--
-- Motif.Last_Occurrence contains the index of the last occurrence of
-- the character in the motif. This is in the range 1 .. Motif'Length
--
-- Good_Suffix at index J:
-- If a mismatch occurs in the j-th character of the pattern, we
-- can safely advance by good_suffix (j).
-- m = Motif'Length
-- GS(J) = m
-- - Max (k; 0<=k 0);
Motif.Motif := new String (1 .. From_String'Length);
Motif.Motif.all := From_String;
if not Case_Sensitive then
To_Lower (Motif.Motif.all);
end if;
Prefix (Prefix'First) := 0;
Reverse_Prefix (Reverse_Prefix'First) := 0;
Motif.Last_Occurrence (Motif.Motif (1)) := 1;
for Q in 2 .. Motif.Motif'Last loop
-- Compute Last occurrence
Motif.Last_Occurrence (Motif.Motif (Q)) := Q;
-- Compute prefix function
while K > 0
and then Motif.Motif (K + 1) /= Motif.Motif (Q)
loop
K := Prefix (K);
end loop;
if Motif.Motif (K + 1) = Motif.Motif (Q) then
K := K + 1;
end if;
Prefix (Q) := K;
-- Compute the reverse prefix function
while K2 > 0
and then Motif.Motif (Motif.Motif'Last - K2) /=
Motif.Motif (Motif.Motif'Last + 1 - Q)
loop
K2 := Reverse_Prefix (K2);
end loop;
if Motif.Motif (Motif.Motif'Last - K2) =
Motif.Motif (Motif.Motif'Last + 1 - Q)
then
K2 := K2 + 1;
end if;
Reverse_Prefix (Q) := K2;
end loop;
-- Compute the good suffix function
K := From_String'Length - Prefix (From_String'Length);
Motif.Good_Suffix := new Offset_Array'(0 .. From_String'Length => K);
for L in Motif.Motif'Range loop
K := From_String'Length - Reverse_Prefix (L);
Tmp := L - Reverse_Prefix (L);
if Motif.Good_Suffix (K) > Tmp then
Motif.Good_Suffix (K) := Tmp;
end if;
end loop;
if Debug then
Put (" i = ");
for J in Motif.Motif'Range loop
Put (Item => J, Width => 3);
end loop;
New_Line;
Put (" Pat[i]= ");
for J in Motif.Motif'Range loop
Put (" " & Motif.Motif (J));
end loop;
New_Line;
Put (" Pre[i]= ");
for J in Prefix'Range loop
Put (Item => Prefix (J), Width => 3);
end loop;
New_Line;
Put ("RevPre[i]= ");
for J in Reverse_Prefix'Range loop
Put (Item => Reverse_Prefix (J), Width => 3);
end loop;
New_Line;
Put ("GoodSu[i]= ");
for J in Motif.Good_Suffix'Range loop
Put (Item => Motif.Good_Suffix (J), Width => 3);
end loop;
New_Line;
end if;
end Compile;
----------
-- Free --
----------
procedure Free (Motif : in out Pattern) is
procedure Internal is new Unchecked_Deallocation
(Offset_Array, Offset_Array_Access);
procedure Internal is new Unchecked_Deallocation (String, String_Access);
begin
Internal (Motif.Good_Suffix);
Internal (Motif.Motif);
end Free;
--------------
-- Dump_Str --
--------------
procedure Dump_Str (Str : String) is
begin
for S in Str'Range loop
if Str (S) = ASCII.LF then
Put (' ');
else
Put (Str (S));
end if;
end loop;
New_Line;
end Dump_Str;
----------
-- Dump --
----------
procedure Dump
(M, Shift, J : Natural;
Num_Comp : in out Natural;
Motif : Pattern;
In_String : String) is
begin
-- Show current automaton state
Num_Comp := Num_Comp + M - J + 1;
if Debug_Run then
New_Line;
Put_Line ("Offset : Shift+j="
& Integer'Image (Shift + J)
& " J=" & J'Img
& " Last_Occ=" & In_String (Shift + J)
& " Max ("
& Motif.Good_Suffix (J)'Img
& ","
& Integer'Image
(J - Motif.Last_Occurrence (In_String (Shift + J)))
& ")");
if In_String'Length < 400 then
Dump_Str (In_String);
Put ((1 .. Shift - In_String'First + 1 => ' '));
end if;
Dump_Str (Motif.Motif.all);
if Shift + J - In_String'First < 400 then
Put ((1 .. Shift + J - In_String'First => ' '));
Put_Line ("^");
end if;
end if;
if J = 0 then
Put_Line ("Matched at position" & Natural'Image (Shift + 1)
& " after" & Num_Comp'Img & " comparisons");
end if;
end Dump;
------------
-- Search --
------------
function Search (Motif : Pattern; In_String : String) return Integer is
M : Natural;
Shift : Natural := In_String'First - 1;
J : Natural;
Num_Comp : Natural := 0;
begin
if Motif.Motif = null then
return -1;
end if;
M := Motif.Motif'Length; -- length of pattern
pragma Assert (Motif.Motif'First = 1);
if not Motif.Case_Sensitive then
while Shift <= In_String'Last - M loop
J := M;
while J > 0
and then Motif.Motif (J) = To_Lower (In_String (Shift + J))
loop
J := J - 1;
end loop;
if J = 0 then
return Shift + 1;
elsif Debug then
Dump (M, Shift, J, Num_Comp, Motif, In_String);
end if;
Shift := Shift +
Natural'Max (Motif.Good_Suffix (J),
J - Motif.Last_Occurrence
(To_Lower (In_String (Shift + J))));
end loop;
else
while Shift <= In_String'Last - M loop
J := M;
while J > 0 and then Motif.Motif (J) = In_String (Shift + J) loop
J := J - 1;
end loop;
if J = 0 then
return Shift + 1;
elsif Debug then
Dump (M, Shift, J, Num_Comp, Motif, In_String);
end if;
Shift := Shift +
Natural'Max (Motif.Good_Suffix (J),
J - Motif.Last_Occurrence (In_String (Shift + J)));
end loop;
end if;
return -1;
end Search;
end GNATCOLL.Boyer_Moore;