Last active
June 20, 2025 00:45
-
-
Save dbuenzli/e395cad463b41db96c74b94af79c81c6 to your computer and use it in GitHub Desktop.
Two way string search
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
_b0 | |
_build |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
# Usage: | |
# ./bench.sh [--search-first] [--worst-case] | |
# Build | |
rm -rf _build | |
mkdir _build | |
cp test.ml _build/ | |
ocamlopt -o _build/test _build/test.ml | |
# Bench | |
declare -a sizes=(1 2 4 8 16 64 128 258) | |
for size in "${sizes[@]}" | |
do | |
hyperfine \ | |
--parameter-list func \ | |
--use-naive,--use-two-way \ | |
"_build/test --bench $size $* {func}" | |
done |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
(*--------------------------------------------------------------------------- | |
Copyright (c) 2025 Daniel C. Bünzli. All rights reserved. | |
SPDX-License-Identifier: ISC | |
---------------------------------------------------------------------------*) | |
(** {1:design_notes Design notes} | |
{ul | |
{- We implement string search with the | |
{{:https://doi.org/10.1145/116825.116845}two way algorithm}. It | |
has a good worse case complexity and the result of | |
pre-preprocessing is only two integers (versus allocating tables | |
for Knuth-Morris-Pratt or Boyer-Moore).} | |
{- We raise on invalid position, we could also return [None] | |
or make the functions behave like the identity (if applicable). But | |
this is more in the spirit of the current [String] module and it's | |
also unclear which one is best.} | |
{- In `rfind_sub`, the [start] argument has a different semantics from | |
the argument [r*_from] functions. The advantage of the definition | |
used below is that it coincides with the mental model of indices | |
which is the one you want to work with.}} | |
{- The [start] argument remains however a position (vs index) | |
so that definitions on empty arguments are consistent and well | |
defined without fuss.}} *) | |
(** String search and replace functions. *) | |
module type API = sig | |
val includes : affix:string -> string -> bool | |
(** [includes ~affix s] is [true] if and only if [s] has [affix] as | |
a substring. | |
@since X.XX *) | |
val find_sub : ?start:int -> sub:string -> string -> int option | |
(** [find_sub ~start ~sub s] is the start position (if any) of the | |
first occurence of [sub] in [s] after or at position [start] | |
(which includes index [start] if it exists, defaults to [0]). | |
Note if you need to search for [sub] multiple times in [s] use | |
{!find_all_sub} it is more efficient. | |
@raise Invalid_argument if [start] is not a valid position of [s]. | |
@since X.XX *) | |
val rfind_sub : ?start:int -> sub:string -> string -> int option | |
(** [rfind_sub ~start ~sub s] is the start position (if any) of the | |
first occurences of [sub] in [s] before or at position [start] | |
(which includes index [start] if it exists, defaults to | |
[String.length s]). | |
Note if you need to search for [sub] multiple times in [s] use | |
{!rfind_all_sub} it is more efficient. | |
@raise Invalid_argument if [start] is not a valid position of [s]. | |
@since X.XX *) | |
val find_all_sub : | |
?start:int -> (int -> 'acc -> 'acc) -> sub:string -> string -> 'acc -> 'acc | |
(** [find_all_sub ~start f ~sub s acc], starting with [acc], folds [f] over | |
all non-overlapping starting positions of [sub] in [s] after or at | |
position [start] (which includes index [start] if it exists, defaults | |
to [0]). This is [acc] if [sub] could not be found in [s]. | |
@raise Invalid_argument if [start] is not a valid position of [s]. | |
@since X.XX *) | |
val rfind_all_sub : | |
?start:int -> (int -> 'acc -> 'acc) -> sub:string -> string -> 'acc -> 'acc | |
(** [rfind_all_sub ~start f ~sub s acc], starting with [acc], folds | |
[f] over all non-overlapping starting positions of [sub] in [s] | |
before or at position [start] (which includes index [start] if | |
it exists, defaults to [String.length s]). This is [acc] if | |
[sub] could not be found in [s]. | |
@raise Invalid_argument if [start] is not a valid position of [s]. | |
@since X.XX *) | |
val replace_first : ?start:int -> sub:string -> by:string -> string -> string | |
(** [replace_first ~start ~sub ~by s] replaces in [s] the first occurence | |
of [sub] at or after position [start] (defaults to [0]) by [by]. | |
@raise Invalid_argument if [start] is not a valid position of [s]. | |
@since X.XX *) | |
val replace_all : ?start:int -> sub:string -> by:string -> string -> string | |
(** [replace_all ~start ~sub ~by] replaces in [s] all | |
non-overlapping occurences of [sub] at or after position [start] | |
(default to [0]) by [by]. | |
@raise Invalid_argument if [start] is not a valid position of [s]. | |
@since X.XX *) | |
end | |
open String (* Make it easier to prepare a PR against upstream *) | |
let invalid_start ~start len = | |
let i = Int.to_string in | |
invalid_arg @@ concat "" ["start: "; i start; "not in range [0;"; i len; "]"] | |
(** Implement API with naive string search *) | |
module Naive = struct | |
let is_sub ~sub s j = | |
let sublen = length sub in | |
let i = ref 0 in | |
while !i < sublen && Char.equal (get s (j + !i)) (get sub !i) | |
do incr i done; | |
!i = sublen | |
let primitive_find_sub ~start ~sub s = | |
let slen = length s in | |
if not (0 <= start && start <= slen) then invalid_start ~start slen else | |
let smax = slen - length sub in | |
let j = ref start in | |
while !j <= smax && not (is_sub ~sub s !j) do incr j done; | |
if !j <= smax then !j else -1 | |
let primitive_rfind_sub ~start ~sub s = | |
let slen = length s in | |
if not (0 <= start && start <= slen) then invalid_start ~start slen else | |
let smax = length s - length sub in | |
let j = ref (if start > smax then smax else start) in | |
while !j >= 0 && not (is_sub ~sub s !j) do decr j done; | |
if !j >= 0 then !j else -1 | |
let includes ~affix s = primitive_find_sub ~start:0 ~sub:affix s <> -1 | |
let find_sub ?(start = 0) ~sub s = | |
match primitive_find_sub ~start ~sub s with -1 -> None | i -> Some i | |
let rfind_sub ?start ~sub s = | |
let start = match start with None -> length s | Some s -> s in | |
match primitive_rfind_sub ~start ~sub s with -1 -> None | i -> Some i | |
let find_all_sub ?(start = 0) f ~sub s acc = | |
let rec loop f acc sub s ~start ~slen = | |
if start > slen then acc else | |
match primitive_find_sub ~start ~sub s with | |
| -1 -> acc | |
| i -> | |
let acc = f i acc in | |
let start = i + Int.max (length sub) 1 in | |
loop f acc sub s ~start ~slen | |
in | |
let slen = length s in | |
if not (0 <= start && start <= slen) then invalid_start ~start slen else | |
loop f acc sub s ~start ~slen | |
let rfind_all_sub ?start f ~sub s acc = | |
let rec loop f acc sub s ~start ~slen = | |
if start < 0 then acc else | |
match primitive_rfind_sub ~start ~sub s with | |
| -1 -> acc | |
| i -> | |
let start = i - Int.max (length sub) 1 in | |
loop f (f i acc) sub s ~start ~slen | |
in | |
let slen = length s in | |
let start = match start with None -> length s | Some s -> s in | |
if not (0 <= start && start <= slen) then invalid_start ~start slen else | |
loop f acc sub s ~start ~slen | |
let replace_first ?(start = 0) ~sub:needle ~by s = | |
match primitive_find_sub ~start ~sub:needle s with | |
| -1 -> s | |
| i -> | |
let rest_first = i + length needle in | |
let rest_len = length s - i - length needle in | |
concat by [sub s 0 i; sub s rest_first rest_len] | |
let replace_all ?start ~sub:needle ~by s = | |
let chunk_first = ref 0 in | |
let add_chunk i acc = | |
let last_chunk = sub s !chunk_first (i - !chunk_first) in | |
chunk_first := i + length needle; last_chunk :: acc | |
in | |
match find_all_sub ?start add_chunk ~sub:needle s [] with | |
| [] -> s | |
| chunks -> | |
let last_chunk = sub s !chunk_first (length s - !chunk_first) in | |
concat by (List.rev (last_chunk :: chunks)) | |
end | |
(** Implement API with two way string search *) | |
module Two_way = struct | |
(* See https://doi.org/10.1145/116825.116845 or | |
http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 *) | |
let find_maximal_suffix_and_period ~sub = | |
let sublen = length sub in | |
let i = ref (-1) and j = ref 0 and k = ref 1 and p = ref 1 in | |
let[@inline] maximal_suffix ~order = | |
while (!j + !k < sublen) do | |
let c = order * Char.compare (get sub (!j + !k)) (get sub (!i + !k)) in | |
if c < 0 then (j := !j + !k; k := 1; p := !j - !i) else | |
if c > 0 then (i := !j; j := !i + 1; k := 1; p := 1) else (* c = 0 *) | |
if !k = !p then (j := !j + !p; k := 1) else incr k | |
done; | |
in | |
(maximal_suffix[@inlined]) ~order:1; | |
let l0 = !i and p0 = !p in | |
i := -1; j := 0; k := 1; p := 1; | |
(maximal_suffix[@inlined]) ~order:(-1); | |
let l1 = !i and p1 = !p in | |
if l0 > l1 then (l0, p0) else (l1, p1) | |
let periodic_sub ~sub ~sub_lp:(l, p) = | |
let i = ref 0 in | |
while !i <= l && Char.equal (get sub !i) (get sub (!i + p)) | |
do incr i done; | |
!i > l | |
let primitive_find_sub ~start ~sub ~sub_lp:(l, p as sub_lp) s = | |
let slen = length s in | |
if not (0 <= start && start <= slen) then invalid_start ~start slen else | |
let sublen = length sub in | |
let smax = slen - sublen in | |
let j = ref start in | |
try | |
if periodic_sub ~sub ~sub_lp then begin | |
let memory = ref (-1) in | |
while (!j <= smax) do | |
let i = ref (1 + Int.max l !memory) in | |
while (!i < sublen && Char.equal (get sub !i) (get s (!i + !j))) | |
do incr i done; | |
if !i < sublen then (j := !j + (!i - l); memory := -1) else | |
begin | |
i := l; | |
while (!i > !memory && Char.equal (get sub !i) (get s (!i + !j))) | |
do decr i done; | |
if !i <= !memory then raise_notrace Exit else | |
(j := !j + p; memory := sublen - p - 1) | |
end | |
done; | |
-1 | |
end else begin | |
let p = 1 + Int.max (l + 1) (sublen - l - 1) in | |
while (!j <= smax) do | |
let i = ref (l + 1) in | |
while (!i < sublen && Char.equal (get sub !i) (get s (!i + !j))) | |
do incr i done; | |
if !i < sublen then (j := !j + (!i - l)) else | |
begin | |
i := l; | |
while (!i >= 0 && Char.equal (get sub !i) (get s (!i + !j))) | |
do decr i done; | |
if !i < 0 then raise_notrace Exit else (j := !j + p) | |
end | |
done; | |
-1 | |
end | |
with Exit -> !j | |
let primitive_rfind_sub ~start ~sub ~sub_lp:(l, p as sub_lp) s = | |
(* Note this is the same as above except for the assignement | |
and test logic on [j] where we move from right to left. *) | |
let slen = length s in | |
if not (0 <= start && start <= slen) then invalid_start ~start slen else | |
let sublen = length sub in | |
let smax = slen - sublen in | |
let j = ref (if start > smax then smax else start) in | |
try | |
if periodic_sub ~sub ~sub_lp then begin | |
let memory = ref (-1) in | |
while (!j >= 0) do | |
let i = ref (1 + Int.max l !memory) in | |
while (!i < sublen && Char.equal (get sub !i) (get s (!i + !j))) | |
do incr i done; | |
if !i < sublen then (j := !j - (!i - l); memory := -1) else | |
begin | |
i := l; | |
while (!i > !memory && Char.equal (get sub !i) (get s (!i + !j))) | |
do decr i done; | |
if !i <= !memory then raise_notrace Exit else | |
(j := !j - p; memory := sublen - p - 1) | |
end | |
done; | |
-1 | |
end else begin | |
let p = 1 + Int.max (l + 1) (sublen - l - 1) in | |
while (!j >= 0) do | |
let i = ref (l + 1) in | |
while (!i < sublen && Char.equal (get sub !i) (get s (!i + !j))) | |
do incr i done; | |
if !i < sublen then (j := !j - (!i - l)) else | |
begin | |
i := l; | |
while (!i >= 0 && Char.equal (get sub !i) (get s (!i + !j))) | |
do decr i done; | |
if !i < 0 then raise_notrace Exit else (j := !j - p) | |
end | |
done; | |
-1 | |
end | |
with Exit -> !j | |
let includes ~affix:sub s = | |
let sub_lp = find_maximal_suffix_and_period ~sub in | |
primitive_find_sub ~start:0 ~sub ~sub_lp s <> -1 | |
let find_sub ?(start = 0) ~sub s = | |
let sub_lp = find_maximal_suffix_and_period ~sub in | |
match primitive_find_sub ~start ~sub_lp ~sub s with | |
| -1 -> None | i -> Some i | |
let rfind_sub ?start ~sub s = | |
let start = match start with None -> length s | Some s -> s in | |
let sub_lp = find_maximal_suffix_and_period ~sub in | |
match primitive_rfind_sub ~start ~sub_lp ~sub s with | |
| -1 -> None | i -> Some i | |
let find_all_sub ?(start = 0) f ~sub s acc = | |
let rec loop f acc sub sub_lp s ~start ~slen = | |
if start > slen then acc else | |
match primitive_find_sub ~start ~sub ~sub_lp s with | |
| -1 -> acc | |
| i -> | |
let acc = f i acc in | |
let start = i + length sub in | |
let start = if start = i then start + 1 else start in | |
loop f acc sub sub_lp s ~start ~slen | |
in | |
let slen = length s in | |
if not (0 <= start && start <= slen) then invalid_start ~start slen else | |
let sub_lp = find_maximal_suffix_and_period ~sub in | |
loop f acc sub sub_lp s ~start ~slen | |
let rfind_all_sub ?start f ~sub s acc = | |
let rec loop f acc sub sub_lp s ~start ~slen = | |
if start < 0 then acc else | |
match primitive_rfind_sub ~start ~sub ~sub_lp s with | |
| -1 -> acc | |
| i -> | |
let start = i - Int.max (length sub) 1 in | |
loop f (f i acc) sub sub_lp s ~start ~slen | |
in | |
let slen = length s in | |
let start = match start with None -> length s | Some s -> s in | |
if not (0 <= start && start <= slen) then invalid_start ~start slen else | |
let sub_lp = find_maximal_suffix_and_period ~sub in | |
loop f acc sub sub_lp s ~start ~slen | |
let replace_first ?(start = 0) ~sub:needle ~by s = | |
let sub_lp = find_maximal_suffix_and_period ~sub:needle in | |
match primitive_find_sub ~start ~sub:needle ~sub_lp s with | |
| -1 -> s | |
| i -> | |
let rest_first = i + length needle in | |
let rest_len = length s - i - length needle in | |
concat by [sub s 0 i; sub s rest_first rest_len] | |
let replace_all ?start ~sub:needle ~by s = | |
let chunk_first = ref 0 in | |
let add_chunk i acc = | |
let acc = sub s !chunk_first (i - !chunk_first) :: acc in | |
chunk_first := i + length needle; acc | |
in | |
match find_all_sub ?start add_chunk ~sub:needle s [] with | |
| [] -> s | |
| chunks -> | |
let chunks = sub s !chunk_first (length s - !chunk_first) :: chunks in | |
concat by (List.rev chunks) | |
end | |
(* Tests *) | |
let is_invalid_arg f = try (f (); assert false) with Invalid_argument _ -> () | |
let test_find_sub (module String : API) = | |
is_invalid_arg (fun () -> String.find_sub ~start:(-1) ~sub:"" ""); | |
assert (String.find_sub ~start:0 ~sub:"" "" = Some 0); | |
is_invalid_arg (fun () -> String.find_sub ~start:1 ~sub:"" ""); | |
assert (String.find_sub ~start:0 ~sub:"" "ab" = Some 0); | |
assert (String.find_sub ~start:1 ~sub:"" "ab" = Some 1); | |
assert (String.find_sub ~start:2 ~sub:"" "ab" = Some 2); | |
is_invalid_arg (fun () -> String.find_sub ~start:3 ~sub:"" "ab"); | |
assert (String.find_sub ~start:0 ~sub:"a" "" = None); | |
assert (String.find_sub ~start:0 ~sub:"a" "a" = Some 0); | |
assert (String.find_sub ~start:1 ~sub:"a" "a" = None); | |
assert (String.find_sub ~start:0 ~sub:"a" "ba" = Some 1); | |
assert (String.find_sub ~start:1 ~sub:"a" "ba" = Some 1); | |
assert (String.find_sub ~start:2 ~sub:"a" "ba" = None); | |
assert (String.find_sub ~start:0 ~sub:"ab" "" = None); | |
assert (String.find_sub ~start:0 ~sub:"ab" "ab" = Some 0); | |
assert (String.find_sub ~start:0 ~sub:"ab" "aab" = Some 1); | |
assert (String.find_sub ~start:1 ~sub:"ab" "aab" = Some 1); | |
assert (String.find_sub ~start:2 ~sub:"ab" "aab" = None); | |
assert (String.find_sub ~start:3 ~sub:"ab" "aab" = None); | |
is_invalid_arg (fun () -> String.find_sub ~start:(-1) ~sub:"abaa" "aba"); | |
assert (String.find_sub ~start:0 ~sub:"abaa" "aba" = None); | |
assert (String.find_sub ~start:2 ~sub:"abaa" "aba" = None); | |
assert (String.find_sub ~start:3 ~sub:"abaa" "aba" = None); | |
is_invalid_arg (fun () -> String.find_sub ~start:4 ~sub:"abaa" "aba"); | |
assert (String.find_sub ~start:0 ~sub:"aba" "ababa" = Some 0); | |
assert (String.find_sub ~start:1 ~sub:"aba" "ababa" = Some 2); | |
assert (String.find_sub ~start:2 ~sub:"aba" "ababa" = Some 2); | |
assert (String.find_sub ~start:3 ~sub:"aba" "ababa" = None); | |
assert (String.find_sub ~start:4 ~sub:"aba" "ababa" = None); | |
assert (String.find_sub ~start:5 ~sub:"aba" "ababa" = None); | |
() | |
let test_rfind_sub (module String : API) = | |
is_invalid_arg (fun () -> String.rfind_sub ~start:(-1) ~sub:"" ""); | |
assert (String.rfind_sub ~start:0 ~sub:"" "" = Some 0); | |
is_invalid_arg (fun () -> String.rfind_sub ~start:1 ~sub:"" ""); | |
assert (String.rfind_sub ~start:0 ~sub:"" "ab" = Some 0); | |
assert (String.rfind_sub ~start:1 ~sub:"" "ab" = Some 1); | |
assert (String.rfind_sub ~start:2 ~sub:"" "ab" = Some 2); | |
is_invalid_arg (fun () -> String.rfind_sub ~start:3 ~sub:"" "ab"); | |
assert (String.rfind_sub ~start:0 ~sub:"a" "" = None); | |
assert (String.rfind_sub ~start:0 ~sub:"a" "a" = Some 0); | |
assert (String.rfind_sub ~start:1 ~sub:"a" "a" = Some 0); | |
assert (String.rfind_sub ~start:0 ~sub:"a" "ba" = None); | |
assert (String.rfind_sub ~start:1 ~sub:"a" "ba" = Some 1); | |
assert (String.rfind_sub ~start:2 ~sub:"a" "ba" = Some 1); | |
assert (String.rfind_sub ~start:0 ~sub:"ab" "" = None); | |
assert (String.rfind_sub ~start:0 ~sub:"ab" "ab" = Some 0); | |
assert (String.rfind_sub ~start:0 ~sub:"ab" "aab" = None); | |
assert (String.rfind_sub ~start:1 ~sub:"ab" "aab" = Some 1); | |
assert (String.rfind_sub ~start:2 ~sub:"ab" "aab" = Some 1); | |
assert (String.rfind_sub ~start:3 ~sub:"ab" "aab" = Some 1); | |
is_invalid_arg (fun () -> String.rfind_sub ~start:(-1) ~sub:"abaa" "aba"); | |
assert (String.rfind_sub ~start:0 ~sub:"abaa" "aba" = None); | |
assert (String.rfind_sub ~start:2 ~sub:"abaa" "aba" = None); | |
assert (String.rfind_sub ~start:3 ~sub:"abaa" "aba" = None); | |
is_invalid_arg (fun () -> String.rfind_sub ~start:4 ~sub:"abaa" "aba"); | |
assert (String.rfind_sub ~start:0 ~sub:"aba" "ababa" = Some 0); | |
assert (String.rfind_sub ~start:1 ~sub:"aba" "ababa" = Some 0); | |
assert (String.rfind_sub ~start:2 ~sub:"aba" "ababa" = Some 2); | |
assert (String.rfind_sub ~start:3 ~sub:"aba" "ababa" = Some 2); | |
assert (String.rfind_sub ~start:4 ~sub:"aba" "ababa" = Some 2); | |
assert (String.rfind_sub ~start:5 ~sub:"aba" "ababa" = Some 2); | |
() | |
let test_find_all_sub (module String : API) = | |
let test ~sub s occs = | |
assert (List.rev (String.find_all_sub List.cons ~sub s []) = occs) | |
in | |
test ~sub:"" "" [0]; | |
test ~sub:"" "ab" [0;1;2]; | |
test ~sub:"a" "" []; | |
test ~sub:"a" "a" [0]; | |
test ~sub:"a" "ba" [1]; | |
test ~sub:"ab" "" []; | |
test ~sub:"ab" "ab" [0]; | |
test ~sub:"ab" "aab" [1]; | |
test ~sub:"abaa" "aba" []; | |
test ~sub:"abaa" "aba" []; | |
test ~sub:"aba" "ababa" [0]; | |
test ~sub:"aba" "babababab" [1;5]; | |
() | |
let test_rfind_all_sub (module String : API) = | |
let test ~sub s occs = | |
assert (List.rev (String.rfind_all_sub List.cons ~sub s []) = occs) | |
in | |
test ~sub:"" "" [0]; | |
test ~sub:"" "ab" [2;1;0]; | |
test ~sub:"a" "" []; | |
test ~sub:"a" "a" [0]; | |
test ~sub:"a" "ba" [1]; | |
test ~sub:"ab" "" []; | |
test ~sub:"ab" "ab" [0]; | |
test ~sub:"ab" "aab" [1]; | |
test ~sub:"abaa" "aba" []; | |
test ~sub:"abaa" "aba" []; | |
test ~sub:"aba" "ababa" [2]; | |
test ~sub:"aba" "babababab" [5;1]; | |
() | |
let test_includes (module String : API) = | |
assert (String.includes ~affix:"" "" = true); | |
assert (String.includes ~affix:"" "a" = true); | |
assert (String.includes ~affix:"" "ab" = true); | |
assert (String.includes ~affix:"a" "" = false); | |
assert (String.includes ~affix:"a" "a" = true); | |
assert (String.includes ~affix:"a" "ab" = true); | |
assert (String.includes ~affix:"a" "ba" = true); | |
assert (String.includes ~affix:"a" "bab" = true); | |
assert (String.includes ~affix:"ab" "" = false); | |
assert (String.includes ~affix:"ab" "a" = false); | |
assert (String.includes ~affix:"ab" "ab" = true); | |
assert (String.includes ~affix:"ab" "aab" = true); | |
assert (String.includes ~affix:"aab" "ab" = false); | |
assert (String.includes ~affix:"ab" "aba" = true); | |
assert (String.includes ~affix:"ab" "aaba" = true); | |
() | |
let test_replace_first (module String : API) = | |
assert (String.replace_first ~sub:"" ~by:"" "" = ""); | |
assert (String.replace_first ~sub:"" ~by:"a" "" = "a"); | |
assert (String.replace_first ~sub:"" ~by:"a" "123" = "a123"); | |
assert (String.replace_first ~start:1 ~sub:"" ~by:"a" "123" = "1a23"); | |
assert (String.replace_first ~start:2 ~sub:"" ~by:"a" "123" = "12a3"); | |
assert (String.replace_first ~start:3 ~sub:"" ~by:"a" "123" = "123a"); | |
assert (String.replace_first ~sub:"1" ~by:"" "123" = "23"); | |
assert (String.replace_first ~sub:"3" ~by:"" "123" = "12"); | |
assert (String.replace_first ~sub:"1" ~by:"" "1" = ""); | |
assert (String.replace_first ~sub:"12" ~by:"z" "123" = "z3"); | |
assert (String.replace_first ~start:2 ~sub:"" ~by:"z" "123" = "12z3"); | |
assert (String.replace_first ~start:3 ~sub:"" ~by:"z" "123" = "123z"); | |
assert (String.replace_first ~start:3 ~sub:"a" ~by:"z" "123" = "123"); | |
() | |
let test_replace_all (module String : API) = | |
assert (String.replace_all ~sub:"" ~by:"" "" = ""); | |
assert (String.replace_all ~sub:"" ~by:"" "1" = "1"); | |
assert (String.replace_all ~sub:"" ~by:"" "12" = "12"); | |
assert (String.replace_all ~sub:"" ~by:"a" "" = "a"); | |
assert (String.replace_all ~sub:"" ~by:"a" "1" = "a1a"); | |
assert (String.replace_all ~sub:"" ~by:"a" "12" = "a1a2a"); | |
assert (String.replace_all ~sub:"" ~by:"a" "123" = "a1a2a3a"); | |
assert (String.replace_all ~start:0 ~sub:"" ~by:"a" "123" = "a1a2a3a"); | |
assert (String.replace_all ~start:1 ~sub:"" ~by:"a" "123" = "1a2a3a"); | |
assert (String.replace_all ~start:2 ~sub:"" ~by:"a" "123" = "12a3a"); | |
assert (String.replace_all ~start:3 ~sub:"" ~by:"a" "123" = "123a"); | |
assert (String.replace_all ~sub:"1" ~by:"" "121" = "2"); | |
assert (String.replace_all ~sub:"1" ~by:"3" "121" = "323"); | |
assert (String.replace_all ~sub:"1" ~by:"" "1" = ""); | |
assert (String.replace_all ~sub:"12" ~by:"a" "123" = "a3"); | |
assert (String.replace_all ~sub:"12" ~by:"a" "123112" = "a31a"); | |
assert (String.replace_all ~start:1 ~sub:"12" ~by:"a" "123112" = "1231a"); | |
() | |
let test api = | |
test_find_sub api; | |
test_rfind_sub api; | |
test_find_all_sub api; | |
test_rfind_all_sub api; | |
test_includes api; | |
test_replace_first api; | |
test_replace_all api; | |
Printf.eprintf "All test passed.\n"; | |
() | |
(* Benchmarking *) | |
let random_ab_string n = String.init n (fun _ -> Char.chr (0x61 + Random.int 2)) | |
let worst_ab_string n = (* a^(n-1)b *) | |
String.init n (fun i -> if i = n - 1 then 'b' else 'a') | |
let benchmark (module String : API) ~search_first ~worst_case needle_size = | |
let () = Random.init 45 in | |
let haystack_size = 4 * 1024 in | |
for i = 1 to 1024 do | |
let needle, haystack = | |
if worst_case | |
then worst_ab_string needle_size, Stdlib.String.make haystack_size 'a' | |
else random_ab_string needle_size, random_ab_string haystack_size | |
in | |
let by = "" in | |
Sys.opaque_identity @@ | |
if search_first | |
then (ignore (String.replace_first ~sub:needle ~by haystack)) | |
else (ignore (String.replace_all ~sub:needle ~by haystack)) | |
done | |
let run bench impl ~search_first ~worst_case = | |
let api = match impl with | |
| `Naive -> (module Naive : API) | |
| `Two_way -> (module Two_way : API) | |
in | |
match bench with | |
| None -> test api; 0 | |
| Some needle_size -> benchmark api ~search_first ~worst_case needle_size; 0 | |
(* Main *) | |
let main () = | |
let bench = ref None in | |
let impl = ref `Two_way in | |
let search_first = ref false in | |
let worst_case = ref false in | |
let args = | |
[ "--bench", Arg.Int (fun needle_size -> bench := Some needle_size), | |
"<n> Test peformance for searching with a needle of given needle size"; | |
"--use-naive", Arg.Unit (fun () -> impl := `Naive), | |
"Use naive search"; | |
"--use-two-way", Arg.Unit (fun () -> impl := `Two_way), | |
"Use two way search"; | |
"--search-first", Arg.Set search_first, | |
"Only search first occurence"; | |
"--worst-case", Arg.Set worst_case, | |
"Test on worst case string search"; ] | |
in | |
Arg.parse args (fun v -> raise (Arg.Bad "Unknown positional argument")) ""; | |
run !bench !impl ~search_first:!search_first ~worst_case:!worst_case | |
let () = if !Sys.interactive then () else exit (main ()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment