-- ------------------------------------------------------------

{- |
   Module     : Data.String.Unicode
   Copyright  : Copyright (C) 2010- Uwe Schmidt
   License    : MIT

   Maintainer : Uwe Schmidt (uwe@fh-wedel.de)
   Stability  : stable
   Portability: portable

   Unicode and UTF-8 Conversion Functions

-}

-- ------------------------------------------------------------

module Data.String.Unicode
    (
     -- * Unicode Type declarations
     Unicode,
     UString,
     UTF8Char,
     UTF8String,
     UStringWithErrors,
     DecodingFct,
     DecodingFctEmbedErrors,

      utf8ToUnicode
    , utf8ToUnicodeEmbedErrors
    , latin1ToUnicode
    , ucs2ToUnicode
    , ucs2BigEndianToUnicode
    , ucs2LittleEndianToUnicode
    , utf16beToUnicode
    , utf16leToUnicode

    , unicodeCharToUtf8

    , unicodeToUtf8
    , unicodeToXmlEntity
    , unicodeToLatin1
    , unicodeRemoveNoneAscii
    , unicodeRemoveNoneLatin1

    , intToCharRef
    , intToCharRefHex
    , intToHexString

    , getDecodingFct
    , getDecodingFctEmbedErrors
    , getOutputEncodingFct

    , normalizeNL
    , guessEncoding

    , getOutputEncodingFct'
    , unicodeCharToUtf8'
    , unicodeCharToXmlEntity'
    , unicodeCharToLatin1'
    )
where

import           Data.Char                         (toUpper)

import           Data.Char.IsoLatinTables
import           Data.Char.Properties.XMLCharProps (isXml1ByteChar,
                                                    isXmlLatin1Char)

import           Data.String.EncodingNames
import           Data.String.UTF8Decoding          (decodeUtf8,
                                                    decodeUtf8EmbedErrors)

-- ------------------------------------------------------------

-- | Unicode is represented as the Char type
--   Precondition for this is the support of Unicode character range
--   in the compiler (e.g. ghc but not hugs)

type Unicode    = Char

-- | the type for Unicode strings

type UString    = [Unicode]

-- | UTF-8 charachters are represented by the Char type

type UTF8Char   = Char

-- | UTF-8 strings are implemented as Haskell strings

type UTF8String = String

-- | Decoding function with a pair containing the result string and a list of decoding errors as result

type DecodingFct = String -> (UString, [String])

type UStringWithErrors = [Either String Char]

-- | Decoding function where decoding errors are interleaved with decoded characters

type DecodingFctEmbedErrors = String -> UStringWithErrors

-- ------------------------------------------------------------

-- |
-- conversion from Unicode strings (UString) to UTF8 encoded strings.

unicodeToUtf8           :: UString -> UTF8String
unicodeToUtf8 :: UString -> UString
unicodeToUtf8           = (Unicode -> UString) -> UString -> UString
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap Unicode -> UString
unicodeCharToUtf8

-- |
-- conversion from Unicode (Char) to a UTF8 encoded string.

unicodeCharToUtf8       :: Unicode -> UTF8String
unicodeCharToUtf8 :: Unicode -> UString
unicodeCharToUtf8 Unicode
c
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0          Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0000007F        -- 1 byte UTF8 (7 bits)
        = [ Int -> Unicode
forall a. Enum a => Int -> a
toEnum Int
i ]
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x00000080 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x000007FF        -- 2 byte UTF8 (5 + 6 bits)
        = [ Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xC0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i                  Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          ]
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x00000800 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0000FFFF        -- 3 byte UTF8 (4 + 6 + 6 bits)
        = [ Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xE0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`   Int
0x1000)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`     Int
0x40) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i                 Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          ]
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x00010000 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x001FFFFF        -- 4 byte UTF8 (3 + 6 + 6 + 6 bits)
        = [ Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xF0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`    Int
0x40000)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`     Int
0x1000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`       Int
0x40) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i                   Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          ]
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x00200000 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x03FFFFFF        -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits)
        = [ Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xF8 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`  Int
0x1000000)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`    Int
0x40000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`     Int
0x1000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`       Int
0x40) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i                   Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          ]
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x04000000 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7FFFFFFF        -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits)
        = [ Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xFC Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
0x40000000)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`  Int
0x1000000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`    Int
0x40000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`     Int
0x1000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`       Int
0x40) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          , Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i                   Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)
          ]
    | Bool
otherwise                                 -- other values not supported
        = UString -> UString
forall a. HasCallStack => UString -> a
error (UString
"unicodeCharToUtf8: illegal integer argument " UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ Int -> UString
forall a. Show a => a -> UString
show Int
i)
    where
    i :: Int
i = Unicode -> Int
forall a. Enum a => a -> Int
fromEnum Unicode
c

-- ------------------------------------------------------------

-- |
-- code conversion from latin1 to Unicode

latin1ToUnicode :: String -> UString
latin1ToUnicode :: UString -> UString
latin1ToUnicode = UString -> UString
forall a. a -> a
id

latinToUnicode  :: [(Char, Char)] -> String -> UString
latinToUnicode :: [(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
tt
    = (Unicode -> Unicode) -> UString -> UString
forall a b. (a -> b) -> [a] -> [b]
map Unicode -> Unicode
charToUni
    where
    charToUni :: Unicode -> Unicode
charToUni Unicode
c =
       ((Unicode, Unicode) -> Unicode -> Unicode)
-> Unicode -> [(Unicode, Unicode)] -> Unicode
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\(Unicode
src,Unicode
dst) Unicode
r ->
          case Unicode -> Unicode -> Ordering
forall a. Ord a => a -> a -> Ordering
compare Unicode
c Unicode
src of
             Ordering
EQ -> Unicode
dst
             Ordering
LT -> Unicode
c {- not found in table -}
             Ordering
GT -> Unicode
r) Unicode
c [(Unicode, Unicode)]
tt

-- | conversion from ASCII to unicode with check for legal ASCII char set
--
-- Structure of decoding function copied from 'Data.Char.UTF8.decode'.

decodeAscii     :: DecodingFct
decodeAscii :: DecodingFct
decodeAscii
    = ([UString], UString) -> (UString, [UString])
forall a b. (a, b) -> (b, a)
swap (([UString], UString) -> (UString, [UString]))
-> (UString -> ([UString], UString)) -> DecodingFct
forall b c a. (b -> c) -> (a -> b) -> a -> c
. [Either UString Unicode] -> ([UString], UString)
forall a b. [Either a b] -> ([a], [b])
partitionEither ([Either UString Unicode] -> ([UString], UString))
-> (UString -> [Either UString Unicode])
-> UString
-> ([UString], UString)
forall b c a. (b -> c) -> (a -> b) -> a -> c
. UString -> [Either UString Unicode]
decodeAsciiEmbedErrors

decodeAsciiEmbedErrors  :: String -> UStringWithErrors
decodeAsciiEmbedErrors :: UString -> [Either UString Unicode]
decodeAsciiEmbedErrors UString
str
    = ((Unicode, Int) -> Either UString Unicode)
-> [(Unicode, Int)] -> [Either UString Unicode]
forall a b. (a -> b) -> [a] -> [b]
map (\(Unicode
c,Int
pos) -> if Unicode -> Bool
isValid Unicode
c
                         then Unicode -> Either UString Unicode
forall a b. b -> Either a b
Right Unicode
c
                         else UString -> Either UString Unicode
forall a b. a -> Either a b
Left (Unicode -> Int -> UString
forall a a. (Show a, Show a) => a -> a -> UString
toErrStr Unicode
c Int
pos)) [(Unicode, Int)]
posStr
    where
    posStr :: [(Unicode, Int)]
posStr = UString -> [Int] -> [(Unicode, Int)]
forall a b. [a] -> [b] -> [(a, b)]
zip UString
str [(Int
0::Int)..]
    toErrStr :: a -> a -> UString
toErrStr a
errChr a
pos
        = UString
" at input position " UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ a -> UString
forall a. Show a => a -> UString
show a
pos UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ UString
": none ASCII char " UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ a -> UString
forall a. Show a => a -> UString
show a
errChr
    isValid :: Unicode -> Bool
isValid Unicode
x = Unicode
x Unicode -> Unicode -> Bool
forall a. Ord a => a -> a -> Bool
< Unicode
'\x80'

-- |
-- UCS-2 big endian to Unicode conversion

ucs2BigEndianToUnicode  :: String -> UString

ucs2BigEndianToUnicode :: UString -> UString
ucs2BigEndianToUnicode (Unicode
b : Unicode
l : UString
r)
    = Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Unicode -> Int
forall a. Enum a => a -> Int
fromEnum Unicode
b Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Unicode -> Int
forall a. Enum a => a -> Int
fromEnum Unicode
l) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
: UString -> UString
ucs2BigEndianToUnicode UString
r

ucs2BigEndianToUnicode []
    = []

ucs2BigEndianToUnicode UString
_
    = []                                -- error "illegal UCS-2 byte input sequence with odd length"
                                        -- is ignored (garbage in, garbage out)

-- ------------------------------------------------------------

-- |
-- UCS-2 little endian to Unicode conversion

ucs2LittleEndianToUnicode       :: String -> UString

ucs2LittleEndianToUnicode :: UString -> UString
ucs2LittleEndianToUnicode (Unicode
l : Unicode
b : UString
r)
    = Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Unicode -> Int
forall a. Enum a => a -> Int
fromEnum Unicode
b Int -> Int -> Int
forall a. Num a => a -> a -> a
* Int
256 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Unicode -> Int
forall a. Enum a => a -> Int
fromEnum Unicode
l) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
: UString -> UString
ucs2LittleEndianToUnicode UString
r

ucs2LittleEndianToUnicode []
    = []

ucs2LittleEndianToUnicode [Unicode
_]
    = []                                -- error "illegal UCS-2 byte input sequence with odd length"
                                        -- is ignored

-- ------------------------------------------------------------

-- |
-- UCS-2 to UTF-8 conversion with byte order mark analysis

ucs2ToUnicode           :: String -> UString

ucs2ToUnicode :: UString -> UString
ucs2ToUnicode (Unicode
'\xFE':Unicode
'\xFF':UString
s)         -- 2 byte mark for big endian encoding
    = UString -> UString
ucs2BigEndianToUnicode UString
s

ucs2ToUnicode (Unicode
'\xFF':Unicode
'\xFE':UString
s)         -- 2 byte mark for little endian encoding
    = UString -> UString
ucs2LittleEndianToUnicode UString
s

ucs2ToUnicode UString
s
    = UString -> UString
ucs2BigEndianToUnicode UString
s          -- default: big endian

-- ------------------------------------------------------------

-- |
-- UTF-8 to Unicode conversion with deletion of leading byte order mark, as described in XML standard F.1

utf8ToUnicode           :: DecodingFct

utf8ToUnicode :: DecodingFct
utf8ToUnicode (Unicode
'\xEF':Unicode
'\xBB':Unicode
'\xBF':UString
s)  -- remove byte order mark ( XML standard F.1 )
    = DecodingFct
decodeUtf8 UString
s

utf8ToUnicode UString
s
    = DecodingFct
decodeUtf8 UString
s

utf8ToUnicodeEmbedErrors        :: DecodingFctEmbedErrors

utf8ToUnicodeEmbedErrors :: UString -> [Either UString Unicode]
utf8ToUnicodeEmbedErrors (Unicode
'\xEF':Unicode
'\xBB':Unicode
'\xBF':UString
s)       -- remove byte order mark ( XML standard F.1 )
    = UString -> [Either UString Unicode]
decodeUtf8EmbedErrors UString
s

utf8ToUnicodeEmbedErrors UString
s
    = UString -> [Either UString Unicode]
decodeUtf8EmbedErrors UString
s

-- ------------------------------------------------------------

-- |
-- UTF-16 big endian to UTF-8 conversion with removal of byte order mark

utf16beToUnicode                :: String -> UString

utf16beToUnicode :: UString -> UString
utf16beToUnicode (Unicode
'\xFE':Unicode
'\xFF':UString
s)              -- remove byte order mark
    = UString -> UString
ucs2BigEndianToUnicode UString
s

utf16beToUnicode UString
s
    = UString -> UString
ucs2BigEndianToUnicode UString
s

-- ------------------------------------------------------------

-- |
-- UTF-16 little endian to UTF-8 conversion with removal of byte order mark

utf16leToUnicode                :: String -> UString

utf16leToUnicode :: UString -> UString
utf16leToUnicode (Unicode
'\xFF':Unicode
'\xFE':UString
s)              -- remove byte order mark
    = UString -> UString
ucs2LittleEndianToUnicode UString
s

utf16leToUnicode UString
s
    = UString -> UString
ucs2LittleEndianToUnicode UString
s


-- ------------------------------------------------------------

-- |
-- substitute all Unicode characters, that are not legal 1-byte
-- UTF-8 XML characters by a character reference.
--
-- This function can be used to translate all text nodes and
-- attribute values into pure ascii.
--
-- see also : 'unicodeToLatin1'

unicodeToXmlEntity      :: UString -> String
unicodeToXmlEntity :: UString -> UString
unicodeToXmlEntity
    = (Unicode -> Bool) -> (Unicode -> UString) -> UString -> UString
escape Unicode -> Bool
isXml1ByteChar (Int -> UString
intToCharRef (Int -> UString) -> (Unicode -> Int) -> Unicode -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Unicode -> Int
forall a. Enum a => a -> Int
fromEnum)

-- |
-- substitute all Unicode characters, that are not legal latin1
-- UTF-8 XML characters by a character reference.
--
-- This function can be used to translate all text nodes and
-- attribute values into ISO latin1.
--
-- see also : 'unicodeToXmlEntity'

unicodeToLatin1 :: UString -> String
unicodeToLatin1 :: UString -> UString
unicodeToLatin1
    = (Unicode -> Bool) -> (Unicode -> UString) -> UString -> UString
escape Unicode -> Bool
isXmlLatin1Char (Int -> UString
intToCharRef (Int -> UString) -> (Unicode -> Int) -> Unicode -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Unicode -> Int
forall a. Enum a => a -> Int
fromEnum)


-- |
-- substitute selected characters
-- The @check@ function returns 'True' whenever a character needs to substitution
-- The function @esc@ computes a substitute.

escape :: (Unicode -> Bool) -> (Unicode -> String) -> UString -> String
escape :: (Unicode -> Bool) -> (Unicode -> UString) -> UString -> UString
escape Unicode -> Bool
check Unicode -> UString
esc =
    (Unicode -> UString) -> UString -> UString
forall (t :: * -> *) a b. Foldable t => (a -> [b]) -> t a -> [b]
concatMap (\Unicode
uc -> if Unicode -> Bool
check Unicode
uc then [Unicode
uc] else Unicode -> UString
esc Unicode
uc)

-- |
-- removes all non ascii chars, may be used to transform
-- a document into a pure ascii representation by removing
-- all non ascii chars from tag and attibute names
--
-- see also : 'unicodeRemoveNoneLatin1', 'unicodeToXmlEntity'

unicodeRemoveNoneAscii  :: UString -> String
unicodeRemoveNoneAscii :: UString -> UString
unicodeRemoveNoneAscii
    = (Unicode -> Bool) -> UString -> UString
forall a. (a -> Bool) -> [a] -> [a]
filter Unicode -> Bool
isXml1ByteChar

-- |
-- removes all non latin1 chars, may be used to transform
-- a document into a pure ascii representation by removing
-- all non ascii chars from tag and attibute names
--
-- see also : 'unicodeRemoveNoneAscii', 'unicodeToLatin1'

unicodeRemoveNoneLatin1 :: UString -> String
unicodeRemoveNoneLatin1 :: UString -> UString
unicodeRemoveNoneLatin1
    = (Unicode -> Bool) -> UString -> UString
forall a. (a -> Bool) -> [a] -> [a]
filter Unicode -> Bool
isXmlLatin1Char

-- ------------------------------------------------------------

-- |
-- convert an Unicode into a XML character reference.
--
-- see also : 'intToCharRefHex'

intToCharRef            :: Int -> String
intToCharRef :: Int -> UString
intToCharRef Int
i
    = UString
"&#" UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ Int -> UString
forall a. Show a => a -> UString
show Int
i UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ UString
";"

-- |
-- convert an Unicode into a XML hexadecimal character reference.
--
-- see also: 'intToCharRef'

intToCharRefHex         :: Int -> String
intToCharRefHex :: Int -> UString
intToCharRefHex Int
i
    = UString
"&#x" UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ UString
h2 UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ UString
";"
      where
      h1 :: UString
h1 = Int -> UString
intToHexString Int
i
      h2 :: UString
h2 = if UString -> Int
forall (t :: * -> *) a. Foldable t => t a -> Int
length UString
h1 Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
2 Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
1
           then Unicode
'0'Unicode -> UString -> UString
forall a. a -> [a] -> [a]
: UString
h1
           else UString
h1

-- ------------------------------------------------------------

intToHexString          :: Int -> String
intToHexString :: Int -> UString
intToHexString Int
i
    | Int
i Int -> Int -> Bool
forall a. Eq a => a -> a -> Bool
== Int
0
        = UString
"0"
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
> Int
0
        = Int -> UString
intToStr Int
i
    | Bool
otherwise
        = UString -> UString
forall a. HasCallStack => UString -> a
error (UString
"intToHexString: negative argument " UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ Int -> UString
forall a. Show a => a -> UString
show Int
i)
    where
    intToStr :: Int -> UString
intToStr Int
0  = UString
""
    intToStr Int
i' = Int -> UString
intToStr (Int
i' Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
16) UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ [Int -> Unicode
fourBitsToChar (Int
i' Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
16)]

fourBitsToChar          :: Int -> Char
fourBitsToChar :: Int -> Unicode
fourBitsToChar Int
i        = UString
"0123456789ABCDEF" UString -> Int -> Unicode
forall a. [a] -> Int -> a
!! Int
i
{-# INLINE fourBitsToChar #-}

-- ------------------------------------------------------------
--
-- | White Space (XML Standard 2.3) and
-- end of line handling (2.11)
--
-- \#x0D and \#x0D\#x0A are mapped to \#x0A

normalizeNL     :: String -> String
normalizeNL :: UString -> UString
normalizeNL (Unicode
'\r' : Unicode
'\n' : UString
rest)        = Unicode
'\n' Unicode -> UString -> UString
forall a. a -> [a] -> [a]
: UString -> UString
normalizeNL UString
rest
normalizeNL (Unicode
'\r' : UString
rest)               = Unicode
'\n' Unicode -> UString -> UString
forall a. a -> [a] -> [a]
: UString -> UString
normalizeNL UString
rest
normalizeNL (Unicode
c : UString
rest)                  = Unicode
c    Unicode -> UString -> UString
forall a. a -> [a] -> [a]
: UString -> UString
normalizeNL UString
rest
normalizeNL []                          = []


-- ------------------------------------------------------------

-- |
-- the table of supported character encoding schemes and the associated
-- conversion functions into Unicode:q

{-
This table could be derived from decodingTableEither,
but this way it is certainly more efficient.
-}

decodingTable   :: [(String, DecodingFct)]
decodingTable :: [(UString, DecodingFct)]
decodingTable
    = [ (UString
utf8,          DecodingFct
utf8ToUnicode                           )
      , (UString
isoLatin1,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct UString -> UString
latin1ToUnicode              )
      , (UString
usAscii,       DecodingFct
decodeAscii                             )
      , (UString
ucs2,          (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct UString -> UString
ucs2ToUnicode                )
      , (UString
utf16,         (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct UString -> UString
ucs2ToUnicode                )
      , (UString
utf16be,       (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct UString -> UString
utf16beToUnicode             )
      , (UString
utf16le,       (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct UString -> UString
utf16leToUnicode             )
      , (UString
iso8859_2,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_2)  )
      , (UString
iso8859_3,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_3)  )
      , (UString
iso8859_4,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_4)  )
      , (UString
iso8859_5,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_5)  )
      , (UString
iso8859_6,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_6)  )
      , (UString
iso8859_7,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_7)  )
      , (UString
iso8859_8,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_8)  )
      , (UString
iso8859_9,     (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_9)  )
      , (UString
iso8859_10,    (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_10) )
      , (UString
iso8859_11,    (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_11) )
      , (UString
iso8859_13,    (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_13) )
      , (UString
iso8859_14,    (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_14) )
      , (UString
iso8859_15,    (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_15) )
      , (UString
iso8859_16,    (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_16) )
      , (UString
unicodeString, (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct UString -> UString
forall a. a -> a
id                           )
      , (UString
"",            (UString -> UString) -> DecodingFct
forall t a a. (t -> a) -> t -> (a, [a])
liftDecFct UString -> UString
forall a. a -> a
id                           )       -- default
      ]
    where
    liftDecFct :: (t -> a) -> t -> (a, [a])
liftDecFct t -> a
df = \ t
s -> (t -> a
df t
s, [])

-- |
-- the lookup function for selecting the decoding function

getDecodingFct          :: String -> Maybe DecodingFct
getDecodingFct :: UString -> Maybe DecodingFct
getDecodingFct UString
enc
    = UString -> [(UString, DecodingFct)] -> Maybe DecodingFct
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup ((Unicode -> Unicode) -> UString -> UString
forall a b. (a -> b) -> [a] -> [b]
map Unicode -> Unicode
toUpper UString
enc) [(UString, DecodingFct)]
decodingTable


-- |
-- Similar to 'decodingTable' but it embeds errors
-- in the string of decoded characters.

decodingTableEmbedErrors        :: [(String, DecodingFctEmbedErrors)]
decodingTableEmbedErrors :: [(UString, UString -> [Either UString Unicode])]
decodingTableEmbedErrors
    = [ (UString
utf8,          UString -> [Either UString Unicode]
utf8ToUnicodeEmbedErrors                )
      , (UString
isoLatin1,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct UString -> UString
latin1ToUnicode              )
      , (UString
usAscii,       UString -> [Either UString Unicode]
decodeAsciiEmbedErrors                  )
      , (UString
ucs2,          (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct UString -> UString
ucs2ToUnicode                )
      , (UString
utf16,         (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct UString -> UString
ucs2ToUnicode                )
      , (UString
utf16be,       (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct UString -> UString
utf16beToUnicode             )
      , (UString
utf16le,       (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct UString -> UString
utf16leToUnicode             )
      , (UString
iso8859_2,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_2)  )
      , (UString
iso8859_3,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_3)  )
      , (UString
iso8859_4,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_4)  )
      , (UString
iso8859_5,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_5)  )
      , (UString
iso8859_6,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_6)  )
      , (UString
iso8859_7,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_7)  )
      , (UString
iso8859_8,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_8)  )
      , (UString
iso8859_9,     (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_9)  )
      , (UString
iso8859_10,    (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_10) )
      , (UString
iso8859_11,    (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_11) )
      , (UString
iso8859_13,    (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_13) )
      , (UString
iso8859_14,    (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_14) )
      , (UString
iso8859_15,    (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_15) )
      , (UString
iso8859_16,    (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct ([(Unicode, Unicode)] -> UString -> UString
latinToUnicode [(Unicode, Unicode)]
iso_8859_16) )
      , (UString
unicodeString, (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct UString -> UString
forall a. a -> a
id                           )
      , (UString
"",            (UString -> UString) -> UString -> [Either UString Unicode]
forall a b a. (a -> [b]) -> a -> [Either a b]
liftDecFct UString -> UString
forall a. a -> a
id                           )       -- default
      ]
    where
    liftDecFct :: (a -> [b]) -> a -> [Either a b]
liftDecFct a -> [b]
df = (b -> Either a b) -> [b] -> [Either a b]
forall a b. (a -> b) -> [a] -> [b]
map b -> Either a b
forall a b. b -> Either a b
Right ([b] -> [Either a b]) -> (a -> [b]) -> a -> [Either a b]
forall b c a. (b -> c) -> (a -> b) -> a -> c
. a -> [b]
df

-- |
-- the lookup function for selecting the decoding function

getDecodingFctEmbedErrors       :: String -> Maybe DecodingFctEmbedErrors
getDecodingFctEmbedErrors :: UString -> Maybe (UString -> [Either UString Unicode])
getDecodingFctEmbedErrors UString
enc
    = UString
-> [(UString, UString -> [Either UString Unicode])]
-> Maybe (UString -> [Either UString Unicode])
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup ((Unicode -> Unicode) -> UString -> UString
forall a b. (a -> b) -> [a] -> [b]
map Unicode -> Unicode
toUpper UString
enc) [(UString, UString -> [Either UString Unicode])]
decodingTableEmbedErrors



-- |
-- the table of supported output encoding schemes and the associated
-- conversion functions from Unicode

outputEncodingTable     :: [(String, (UString -> String))]
outputEncodingTable :: [(UString, UString -> UString)]
outputEncodingTable
    = [ (UString
utf8,          UString -> UString
unicodeToUtf8           )
      , (UString
isoLatin1,     UString -> UString
unicodeToLatin1         )
      , (UString
usAscii,       UString -> UString
unicodeToXmlEntity      )
      , (UString
unicodeString, UString -> UString
forall a. a -> a
id                      )
      , (UString
"",            UString -> UString
unicodeToUtf8           )       -- default
      ]

-- |
-- the lookup function for selecting the encoding function

getOutputEncodingFct            :: String -> Maybe (String -> UString)
getOutputEncodingFct :: UString -> Maybe (UString -> UString)
getOutputEncodingFct UString
enc
    = UString
-> [(UString, UString -> UString)] -> Maybe (UString -> UString)
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup ((Unicode -> Unicode) -> UString -> UString
forall a b. (a -> b) -> [a] -> [b]
map Unicode -> Unicode
toUpper UString
enc) [(UString, UString -> UString)]
outputEncodingTable

-- ------------------------------------------------------------
--

guessEncoding           :: String -> String

guessEncoding :: UString -> UString
guessEncoding (Unicode
'\xFF':Unicode
'\xFE':Unicode
'\x00':Unicode
'\x00':UString
_)   = UString
"UCS-4LE"             -- with byte order mark
guessEncoding (Unicode
'\xFF':Unicode
'\xFE':UString
_)                 = UString
"UTF-16LE"            -- with byte order mark

guessEncoding (Unicode
'\xFE':Unicode
'\xFF':Unicode
'\x00':Unicode
'\x00':UString
_)   = UString
"UCS-4-3421"          -- with byte order mark
guessEncoding (Unicode
'\xFE':Unicode
'\xFF':UString
_)                 = UString
"UTF-16BE"            -- with byte order mark

guessEncoding (Unicode
'\xEF':Unicode
'\xBB':Unicode
'\xBF':UString
_)          = UString
utf8                  -- with byte order mark

guessEncoding (Unicode
'\x00':Unicode
'\x00':Unicode
'\xFE':Unicode
'\xFF':UString
_)   = UString
"UCS-4BE"             -- with byte order mark
guessEncoding (Unicode
'\x00':Unicode
'\x00':Unicode
'\xFF':Unicode
'\xFE':UString
_)   = UString
"UCS-4-2143"          -- with byte order mark

guessEncoding (Unicode
'\x00':Unicode
'\x00':Unicode
'\x00':Unicode
'\x3C':UString
_)   = UString
"UCS-4BE"             -- "<" of "<?xml"
guessEncoding (Unicode
'\x3C':Unicode
'\x00':Unicode
'\x00':Unicode
'\x00':UString
_)   = UString
"UCS-4LE"             -- "<" of "<?xml"
guessEncoding (Unicode
'\x00':Unicode
'\x00':Unicode
'\x3C':Unicode
'\x00':UString
_)   = UString
"UCS-4-2143"          -- "<" of "<?xml"
guessEncoding (Unicode
'\x00':Unicode
'\x3C':Unicode
'\x00':Unicode
'\x00':UString
_)   = UString
"UCS-4-3412"          -- "<" of "<?xml"

guessEncoding (Unicode
'\x00':Unicode
'\x3C':Unicode
'\x00':Unicode
'\x3F':UString
_)   = UString
"UTF-16BE"            -- "<?" of "<?xml"
guessEncoding (Unicode
'\x3C':Unicode
'\x00':Unicode
'\x3F':Unicode
'\x00':UString
_)   = UString
"UTF-16LE"            -- "<?" of "<?xml"

guessEncoding (Unicode
'\x4C':Unicode
'\x6F':Unicode
'\xA7':Unicode
'\x94':UString
_)   = UString
"EBCDIC"              -- "<?xm" of "<?xml"

guessEncoding UString
_                                 = UString
""                    -- no guess

-- ------------------------------------------------------------

swap :: (a,b) -> (b,a)
swap :: (a, b) -> (b, a)
swap (a
x,b
y) = (b
y,a
x)
{-# INLINE swap #-}

partitionEither :: [Either a b] -> ([a], [b])
partitionEither :: [Either a b] -> ([a], [b])
partitionEither =
   (Either a b -> ([a], [b]) -> ([a], [b]))
-> ([a], [b]) -> [Either a b] -> ([a], [b])
forall (t :: * -> *) a b.
Foldable t =>
(a -> b -> b) -> b -> t a -> b
foldr (\Either a b
x ~([a]
ls,[b]
rs) -> (a -> ([a], [b])) -> (b -> ([a], [b])) -> Either a b -> ([a], [b])
forall a c b. (a -> c) -> (b -> c) -> Either a b -> c
either (\a
l -> (a
la -> [a] -> [a]
forall a. a -> [a] -> [a]
:[a]
ls,[b]
rs)) (\b
r -> ([a]
ls,b
rb -> [b] -> [b]
forall a. a -> [a] -> [a]
:[b]
rs)) Either a b
x) ([],[])
{-# INLINE partitionEither #-}

-- ------------------------------------------------------------

-- output encoding for bytestrings

-- |
-- the table of supported output encoding schemes and the associated
-- conversion functions from Unicode

type StringFct          = String -> String

outputEncodingTable'     :: [(String, (Char -> StringFct))]
outputEncodingTable' :: [(UString, Unicode -> UString -> UString)]
outputEncodingTable'
    = [ (UString
utf8,          Unicode -> UString -> UString
unicodeCharToUtf8'           )
      , (UString
isoLatin1,     Unicode -> UString -> UString
unicodeCharToLatin1'         )
      , (UString
usAscii,       Unicode -> UString -> UString
unicodeCharToXmlEntity'      )
      , (UString
"",            Unicode -> UString -> UString
unicodeCharToUtf8'           )       -- default
      ]

-- |
-- the lookup function for selecting the encoding function

getOutputEncodingFct'            :: String -> Maybe (Char -> StringFct)
getOutputEncodingFct' :: UString -> Maybe (Unicode -> UString -> UString)
getOutputEncodingFct' UString
enc
    = UString
-> [(UString, Unicode -> UString -> UString)]
-> Maybe (Unicode -> UString -> UString)
forall a b. Eq a => a -> [(a, b)] -> Maybe b
lookup ((Unicode -> Unicode) -> UString -> UString
forall a b. (a -> b) -> [a] -> [b]
map Unicode -> Unicode
toUpper UString
enc) [(UString, Unicode -> UString -> UString)]
outputEncodingTable'


-- ------------------------------------------------------------

-- |
-- conversion from Unicode (Char) to a UTF8 encoded string.

unicodeCharToUtf8'      :: Char -> StringFct
unicodeCharToUtf8' :: Unicode -> UString -> UString
unicodeCharToUtf8' Unicode
c
    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0          Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0000007F        -- 1 byte UTF8 (7 bits)
        = (Unicode
c Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:)

    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x00000080 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x000007FF        -- 2 byte UTF8 (5 + 6 bits)
        = ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xC0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
0x40)                   ) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ Int
i                    Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:)

    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x00000800 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x0000FFFF        -- 3 byte UTF8 (4 + 6 + 6 bits)
        = ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xE0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`     Int
0x1000)            ) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`       Int
0x40) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i                   Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:)

    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x00010000 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x001FFFFF        -- 4 byte UTF8 (3 + 6 + 6 + 6 bits) -- extension to encode 21 bit values
        = ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xF0 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`    Int
0x40000)            ) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`     Int
0x1000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`       Int
0x40) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i                   Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:)

    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x00200000 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x03FFFFFF        -- 5 byte UTF8 (2 + 6 + 6 + 6 + 6 bits) -- extension to encode 26 bit values
        = ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xF8 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`  Int
0x1000000)            ) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`    Int
0x40000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`     Int
0x1000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`       Int
0x40) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i                   Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:)

    | Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
>= Int
0x04000000 Bool -> Bool -> Bool
&& Int
i Int -> Int -> Bool
forall a. Ord a => a -> a -> Bool
<= Int
0x7FFFFFFF        -- 6 byte UTF8 (1 + 6 + 6 + 6 + 6 + 6 bits) -- extension to encode 31 bit values
        = ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0xFC Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div` Int
0x40000000)            ) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`  Int
0x1000000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`    Int
0x40000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`     Int
0x1000) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+ (Int
i Int -> Int -> Int
forall a. Integral a => a -> a -> a
`div`       Int
0x40) Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:) (UString -> UString) -> (UString -> UString) -> UString -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
.
          ((Int -> Unicode
forall a. Enum a => Int -> a
toEnum (Int
0x80 Int -> Int -> Int
forall a. Num a => a -> a -> a
+  Int
i                   Int -> Int -> Int
forall a. Integral a => a -> a -> a
`mod` Int
0x40)) Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:)

    | Bool
otherwise                                 -- other values not supported
        = UString -> UString -> UString
forall a. HasCallStack => UString -> a
error (UString
"unicodeCharToUtf8: illegal integer argument " UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++ Int -> UString
forall a. Show a => a -> UString
show Int
i)
    where
    i :: Int
i = Unicode -> Int
forall a. Enum a => a -> Int
fromEnum Unicode
c

-- ------------------------------------------------------------

-- |
-- substitute all Unicode characters, that are not legal 1-byte
-- UTF-8 XML characters by a character reference.

unicodeCharToXmlEntity' :: Char -> StringFct
unicodeCharToXmlEntity' :: Unicode -> UString -> UString
unicodeCharToXmlEntity' Unicode
c
    | Unicode -> Bool
isXml1ByteChar Unicode
c  = (Unicode
c Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:)
    | Bool
otherwise         = ((Int -> UString
intToCharRef (Int -> UString) -> (Unicode -> Int) -> Unicode -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Unicode -> Int
forall a. Enum a => a -> Int
fromEnum (Unicode -> UString) -> Unicode -> UString
forall a b. (a -> b) -> a -> b
$ Unicode
c) UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++)

-- ------------------------------------------------------------

-- |
-- substitute all Unicode characters, that are not legal latin1
-- UTF-8 XML characters by a character reference.

unicodeCharToLatin1'    :: Char -> StringFct
unicodeCharToLatin1' :: Unicode -> UString -> UString
unicodeCharToLatin1' Unicode
c
    | Unicode -> Bool
isXmlLatin1Char Unicode
c = (Unicode
c Unicode -> UString -> UString
forall a. a -> [a] -> [a]
:)
    | Bool
otherwise         = ((Int -> UString
intToCharRef (Int -> UString) -> (Unicode -> Int) -> Unicode -> UString
forall b c a. (b -> c) -> (a -> b) -> a -> c
. Unicode -> Int
forall a. Enum a => a -> Int
fromEnum (Unicode -> UString) -> Unicode -> UString
forall a b. (a -> b) -> a -> b
$ Unicode
c) UString -> UString -> UString
forall a. [a] -> [a] -> [a]
++)

-- ------------------------------------------------------------