module Utf16: sig
.. end
UTF-16 support for Ulex.
Implementation as described in "http://www.ietf.org/rfc/rfc2781.txt".
exception MalFormed
UTF-16 can be encoded in little endian format (0xabcd ->
(0xcd|0xab))) or big endian format (0xabcd -> (0xab|0xcd)).
type byte_order =
| Little_endian
| Big_endian
Interface
val to_int_array : byte_order option -> string -> int -> int -> int array
to_int_array opt_bo str spos bytes
decodes the string str
of
length bytes
starting in position spos
. If opt_bo
matches
with None
the function tries to detect a BOM; if it can't, it
assumes big endian byte order. If opt_bo
matches with Some bo
byte order bo
is assumed and potential byte order marks are
interpreted as code points 0xfeff.
val from_int_array : byte_order -> int array -> int -> int -> bool -> string
from_int_array bo a apos len bom
encodes an int array a
containing len
code points from position apos
into a string
with byte order bo
. The result starts with a BOM if bom =
true
.
val stream_from_char_stream : byte_order option -> char Stream.t -> int Stream.t
stream_from_char_stream opt_bo str
creates a new int stream
containing the code points encoded in str
. Treats opt_bo
in the same way as
to_int_array
.
Low level
val get_byte_order : char -> char -> byte_order
get_byte_order c1 c2
determines the byte order by a pair of
bytes/characters c1
and c2
.
val from_stream : byte_order -> char Stream.t -> int
from_stream bo s
reads the next code point from a stream encoded
in byte order bo
.
val number_of_char_pair : byte_order -> char -> char -> int
number_of_char_pair bo c1 c2
returns the code point encoded in
c1
and c2
following byte order bo
.
val char_pair_of_number : byte_order -> int -> char * char
char_pair_of_number bo cp
encodes code point cp
into two
characters with byte order bo
.
val next_code : byte_order -> string -> int -> int -> int * int
next_code bo s pos bytes
reads the code point starting at
position pos
in a string s
of total length bytes
.
val compute_len : byte_order option -> string -> int -> int -> int
compute_len opt_bo str pos len
computes the
number of encoded code points in string str
from position
pos
to pos+len-1
.
val blit_to_int : byte_order option -> string -> int -> int array -> int -> int -> unit
blit_to_int opt_bo str spos a apos n
decodes n
bytes
from string str
starting at position spos
into
array a
, at position apos
.
val store : byte_order -> Buffer.t -> int -> unit
store bo buf cp
adds a codepoint cp
to a buffer buf
following the byte order bo
.
val from_utf16_stream : char Stream.t -> byte_order option -> Ulexing.lexbuf
from_utf16_stream s opt_bo
creates a lexbuf from a UTF-16
encoded stream. If opt_bo
matches with None
the function
expects a BOM (Byte Order Mark), and takes the byte order as
Utf16.Big_endian
if it cannot find one. When opt_bo
matches
with Some bo
, bo
is taken as byte order. In this case a
leading BOM is kept in the stream (the lexer has to ignore it),
and a `wrong' BOM (0xfffe
) will raise Utf16.InvalidCodepoint.
val from_utf16_channel : Pervasives.in_channel -> byte_order option -> Ulexing.lexbuf
Works as from_utf16_stream
with an in_channel
.
val from_utf16_string : string -> byte_order option -> Ulexing.lexbuf
Works as from_utf16_stream
with a string
.
val utf16_lexeme : Ulexing.lexbuf -> byte_order -> bool -> string
utf16_lexeme lb bo bom
as Ulexing.lexeme
with a result encoded in
UTF-16 in byte_order bo
and starting with a BOM if bom = true
.
val utf16_sub_lexeme : Ulexing.lexbuf -> int -> int -> byte_order -> bool -> string
utf16_sub_lexeme lb pos len bo bom
as Ulexing.sub_lexeme
with a
result encoded in UTF-16 with byte order bo
and starting with a BOM
if bom=true