struct

 
 (** The result of a matching of a regular expression with a string may be:

None which means that the matching have failed

Some(a,x,gl,b) which means that:

  • there exists a substring x of the input string that matches the regular expression;
  • the integers a and b (the frame) are the positions (indexes) of the beginning and the end of the substring x w.r.t. the input string;
  • the value gl is the list of substrings which have matched the groups defined in the regular expression; the length of this list will be equal to the number of groups defined in the regular expression.
Example:

# let r = mkregexp ["("]  ["[0-9]*"; "[,]?"; "[0-9]*"]  [")"] ;;

# match_whole r "abcd";;
  : result = None

# match_whole r "(16,7)";;
  : result = Some (0, "(16,7)", ["16"; ","; "7"], 5)

*)

 type result = (int * string * (string list) * int) option
 ;;

(**

Building

*)


(** Facility for building regular expressions. The call mkregexp pl gl sl causes the following actions:
  • the strings in pl are simply catenated in a unique string (the prefix)
  • the strings in gl are catenated enclosing each one into "\\(" and "\\)" in order to define distinct groups
  • the strings in sl are simply catenated in a unique string (the suffix)
  • the result is the compiled regexp of the catenation of prefix, groups and suffix. The optional boolean parameter strict, set to true by default, means that the regular expression will be matched exactly. In other words, the string obtained as described above, by default is finally enclosed into "^" and "$".
*)

let mkregexp ?(strict:bool=true) pl gl sl : Str.regexp = 

   let prefix = String.big (^) pl in
   let groups = String.big (^) (List.map (fun x->("\\(" ^ x ^ "\\)")) gl) in
   let suffix = String.big (^) sl in
   let expr = prefix ^ groups ^ suffix in
   let expr = if strict then ("^" ^ expr ^ "$"else expr in
   Str.regexp expr
;;

(**

Matching

*)


(** The call matched_groups i x returns the list of substrings of x matching groups starting from the group number i. See the standard Str.matched_group for more details. *)

let rec matched_groups i x : (string list) = 
 try
    let g=(Str.matched_group i x) in g::(matched_groups (i+1) x)
 with _ -> []
;;

(** The heuristic match_frame r s (a,b) try to match the substring (a,b) of the string s with the compiled regular expression r. *)

 let match_frame (r:Str.regexp) (s:string) (a,b) : result =
  try begin 
  let s  = String.sub s a (b-a+1)    in  
  let i  = Str.search_forward r s 0 in
  let y  = Str.matched_string s      in
  let j  = (Str.match_end ())-1      in  Some (a+i,y,(matched_groups 1 s),a+j) 
  end with Not_found -> None
;;


(** The heuristic match_whole r s (a,b) try to match the whole string s with the compiled regular expression r. *)

let match_whole (r:Str.regexp) (s:string) : result = 
 try
 let a  = Str.search_forward r s 0 in 
 let y  = Str.matched_string s      in
 let b  = (Str.match_end ())-1      in  Some (a,y,(matched_groups 1 s),b)
 with Not_found -> None
;;

(** Similar to match_whole but the regular expression is given as a simple string and compiled on the fly before invoking match_whole. In other words, match_string e s is simpy a shortcut for match_whole (Str.regexp e) s. *)

let match_string (expr:string) (s:string) : result = 
 match_whole (Str.regexp expr) s
;;

(** Extract parts of a string using a regexp containing some group expressions \((..\)). If the input string does not match, the empty list is returned. Example:
# extract_groups (Str.regexp "aa\\([0-9]*\\)bb\\([A-Z]*\\)cc") "aa12bbZcc";;
  : string list = ["12"; "Z"]
*)

let extract_groups (r:Str.regexp) (s:string) : string list = 
   match match_whole r s with
   | None -> raise Not_found
(* To do: It was "[]". This may break things, and should be considered experimental --L. *)
   | Some (a,s,g,b) -> g
   ;; 


(**

Boolean versions

*)


(** Boolean versions of matching heuristics (true stands for <>None). *)

module Bool = struct 

  
  (** Boolean version of the heuristic match_frame. *)

  let match_frame  (r:Str.regexp) (s:string) (a,b) : bool = ((match_frame r s (a,b)) <> None)

  
  (** Boolean version of the heuristic match_whole. *)

  let match_whole  (r:Str.regexp) (s:string)       : bool = ((match_whole r s) <> None)

  
  (** Boolean version of the heuristic match_string. *)

  and match_string (e:string)     (s:string)       : bool = ((match_whole (Str.regexp e) s) <> None

end;; (* module Bool *)


(**

Stuff

*)



(** minus x y delete the rightmost occurrence of the pattern y into the string x.

Examples:

# minus "foo.bar.txt" "[.][a-z]*";;
  : string = "foo.bar"

# minus "/usr/local/bin" "[/][a-z]*";;
  : string = "/usr/local"

*)

let minus x y = 
 let pattern=("\\(.*\\)"^y^"\\(.*\\)"in 
 match (extract_groups (Str.regexp pattern) x) with 
 | [] -> x 
 | l  -> String.big (^) l 
;; 

(** Grep on string lists: only strings matching the pattern are selected.

Examples:

# grep "[0-9]" ["aa";"bb";"c8";"dd";"1e"]  ;;
  : string list = ["c8"; "1e"] 

# grep "[0-9]$" ["aa";"bb";"c8";"dd";"1e"]  ;;
  : string list = ["c8"]

# "ls" => ( Sys.run || fst || String.to_list || grep ".*mli$" ) ;; 
  : string list = ["foo.mli"; "bar.mli"] 
*)

let grep (e:string) (sl:string list) : string list = 
 let r = Str.regexp e in
 List.filter (Bool.match_whole r) sl
;;


(** Check if a string can be used as an identifier. *)

let wellFormedName ?(allow_dash=false) =
  if allow_dash then
    Bool.match_string "^[a-zA-Z][a-zA-Z0-9_\\-]*$"
  else
    Bool.match_string "^[a-zA-Z][a-zA-Z0-9_]*$";;

end