# /usr/share/julia/base/unicode/checkstring.jl is in julia-common 0.4.7-6.
# This file is owned by root:root, with mode 0o644.
# The actual contents of the file can be viewed below.
# This file is a part of Julia. License is MIT: http://julialang.org/license
## Functions to check validity of UTF-8, UTF-16, and UTF-32 encoded strings,
# and also to return information necessary to convert to other encodings
# Surrogate classification for a single code unit / code point.
#
# The tests compare the bits above the surrogate payload by shifting instead of
# masking with a fixed-width literal.  A mask such as `~0x003ff` is a `UInt32`;
# when `c` is wider (e.g. `UInt64`) the mask is zero-extended during promotion,
# which silently clears the upper bits of `c` and makes values such as
# `0x000000010000d800` look like surrogates.  Shifting is independent of the
# width of `c`.

# `true` when `c` is a leading (high) surrogate, i.e. in 0xd800-0xdbff
is_surrogate_lead(c::Unsigned) = ((c >> 10) == 0x36)

# `true` when `c` is a trailing (low) surrogate, i.e. in 0xdc00-0xdfff
is_surrogate_trail(c::Unsigned) = ((c >> 10) == 0x37)

# `true` when `c` is any surrogate code unit, i.e. in 0xd800-0xdfff
is_surrogate_codeunit(c::Unsigned) = ((c >> 11) == 0x1b)
# A UTF-8 continuation byte has the bit pattern 10xxxxxx: keep the two marker
# bits selected by 0xc0 and compare them against 0x80.
function is_valid_continuation(c)
    marker = c & 0xc0
    return marker == 0x80
end
## Bit flags OR-ed together into the `flags` value returned by the check functions.
## Each constant occupies its own bit so that they can be combined and tested
## independently.
const UTF_LONG      = 1 << 0   ##< Long encodings are present
const UTF_LATIN1    = 1 << 1   ##< characters in range 0x80-0xFF present
const UTF_UNICODE2  = 1 << 2   ##< characters in range 0x100-0x7ff present
const UTF_UNICODE3  = 1 << 3   ##< characters in range 0x800-0xd7ff, 0xe000-0xffff
const UTF_UNICODE4  = 1 << 4   ##< non-BMP characters present
const UTF_SURROGATE = 1 << 5   ##< surrogate pairs present
## Fold one UTF-8 continuation byte into the character value built so far.
## `ch`  is the value accumulated from the preceding bytes of the sequence,
## `byt` is the candidate continuation byte, and `pos` is only used for error
## reporting.  Throws `UnicodeError(UTF_ERR_CONT, pos, byt)` when `byt` is not a
## continuation byte; otherwise returns `ch` shifted left by six bits with the
## low six bits of `byt` appended.
@inline function get_continuation(ch::UInt32, byt::UInt8, pos)
    is_valid_continuation(byt) || throw(UnicodeError(UTF_ERR_CONT, pos, byt))
    payload = byt & 0x3f
    return (ch << 6) | payload
end
"""
    unsafe_checkstring(dat, pos = start(dat), endpos = endof(dat); kwargs...)

Validates and calculates number of characters in a UTF-8, UTF-16 or UTF-32 encoded vector/string

Warning: this function does not check the bounds of the start or end positions.
Use `checkstring` to make sure the bounds are checked.

Input Arguments:

* `dat`    UTF-8 (`Vector{UInt8}`), UTF-16 (`Vector{UInt16}`) or UTF-32 (`Vector{UInt32}`, `AbstractString`) encoded string

Optional Input Arguments:

* `pos`    start position (defaults to `start(dat)`)
* `endpos` end position (defaults to `endof(dat)`)

Keyword Arguments:

* `accept_long_null`  = `true`  # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`)
* `accept_surrogates` = `true`  # `CESU-8`
* `accept_long_char`  = `false` # Accept arbitrary long encodings

All three keywords are accepted by every method.  `accept_long_null` and
`accept_long_char` are only consulted when `dat` is a `Vector{UInt8}`;
`accept_surrogates` is not consulted when `dat` is a `Vector{UInt16}`.

Returns:

* (total characters, flags, 4-byte, 3-byte, 2-byte)

`flags` is the bitwise OR of the `UTF_LONG`, `UTF_LATIN1`, `UTF_UNICODE2`,
`UTF_UNICODE3`, `UTF_UNICODE4` and `UTF_SURROGATE` constants that apply; the
last three tuple elements are the counts accumulated for 4-byte, 3-byte and
2-byte characters respectively.  A valid surrogate pair is counted as a single
character.

Throws:

* `UnicodeError`
"""
function unsafe_checkstring end
# UTF-8 method of `unsafe_checkstring`.
#
# Walks the bytes `dat[pos:endpos]` (no bounds checking is performed on `pos`
# or `endpos`), validating every sequence and counting how many characters
# would need 2, 3 or 4 bytes in UTF-8.
#
# Keyword arguments:
# * `accept_long_null`  - accept the 2-byte overlong encoding of NUL (0xc0 0x80)
# * `accept_surrogates` - accept a surrogate pair encoded as two 3-byte sequences
# * `accept_long_char`  - accept any overlong encoding
#
# Returns `(totalchar, flags, num4byte, num3byte, num2byte)` where `flags` is a
# `UInt` built from the `UTF_*` flag constants.
#
# Throws `UnicodeError` for truncated sequences, bad lead or continuation
# bytes, disallowed overlong encodings, malformed or disallowed surrogates and
# values above 0x10ffff.
function unsafe_checkstring(dat::Vector{UInt8},
                            pos = start(dat),
                            endpos = endof(dat)
                            ;
                            accept_long_null  = true,
                            accept_surrogates = true,
                            accept_long_char  = false)
    local byt::UInt8, ch::UInt32, surr::UInt32
    flags::UInt = 0
    totalchar = num2byte = num3byte = num4byte = 0
    @inbounds while pos <= endpos
        ch, pos = next(dat, pos)
        totalchar += 1
        if ch > 0x7f
            # Check UTF-8 encoding
            if ch < 0xc0
                # 0x80-0xbf are continuation bytes and can never start a sequence.
                # Without this guard they would be decoded as 2-byte lead bytes and
                # input such as [0xa0, 0x80] would be accepted as valid UTF-8.
                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            elseif ch < 0xe0
                # 2-byte UTF-8 sequence (i.e. characters 0x80-0x7ff)
                (pos > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                byt, pos = next(dat, pos)
                # lead byte is 110xxxxx, so only the low five bits are payload
                ch = get_continuation(ch & 0x1f, byt, pos)
                if ch > 0x7f
                    num2byte += 1
                    flags |= (ch > 0xff) ? UTF_UNICODE2 : UTF_LATIN1
                elseif accept_long_char
                    flags |= UTF_LONG
                elseif (ch == 0) && accept_long_null
                    flags |= UTF_LONG
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos, ch))
                end
            elseif ch < 0xf0
                # 3-byte UTF-8 sequence (i.e. characters 0x800-0xffff)
                (pos + 1 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                byt, pos = next(dat, pos)
                ch = get_continuation(ch & 0x0f, byt, pos)
                byt, pos = next(dat, pos)
                ch = get_continuation(ch, byt, pos)
                # check for surrogate pairs, make sure correct
                if is_surrogate_codeunit(ch)
                    !is_surrogate_lead(ch) && throw(UnicodeError(UTF_ERR_NOT_LEAD, pos-2, ch))
                    # next character *must* be a trailing surrogate character
                    (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos-2, ch))
                    byt, pos = next(dat, pos)
                    (byt != 0xed) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, byt))
                    byt, pos = next(dat, pos)
                    # 0x0000d is a UInt32 literal: the payload of the 0xed lead byte
                    surr = get_continuation(0x0000d, byt, pos)
                    byt, pos = next(dat, pos)
                    surr = get_continuation(surr, byt, pos)
                    !is_surrogate_trail(surr) && throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos-2, surr))
                    !accept_surrogates && throw(UnicodeError(UTF_ERR_SURROGATE, pos-2, surr))
                    flags |= UTF_SURROGATE
                    # a surrogate pair stands for one non-BMP (4-byte) character
                    num4byte += 1
                elseif ch > 0x07ff
                    num3byte += 1
                elseif accept_long_char
                    # Overlong 3-byte encoding of a character below 0x800.  Only
                    # characters above 0x7f need two bytes; an overlong ASCII
                    # character must not be counted (same rule as the 4-byte case).
                    flags |= UTF_LONG
                    if ch > 0x7f
                        num2byte += 1
                    end
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                end
            elseif ch < 0xf5
                # 4-byte UTF-8 sequence (i.e. characters > 0xffff)
                (pos + 2 > endpos) && throw(UnicodeError(UTF_ERR_SHORT, pos, ch))
                byt, pos = next(dat, pos)
                ch = get_continuation(ch & 0x07, byt, pos)
                byt, pos = next(dat, pos)
                ch = get_continuation(ch, byt, pos)
                byt, pos = next(dat, pos)
                ch = get_continuation(ch, byt, pos)
                if ch > 0x10ffff
                    throw(UnicodeError(UTF_ERR_INVALID, pos-3, ch))
                elseif ch > 0xffff
                    num4byte += 1
                elseif is_surrogate_codeunit(ch)
                    throw(UnicodeError(UTF_ERR_SURROGATE, pos-3, ch))
                elseif accept_long_char
                    # This is an overly long encoded character
                    flags |= UTF_LONG
                    if ch > 0x7ff
                        num3byte += 1
                    elseif ch > 0x7f
                        num2byte += 1
                    end
                else
                    throw(UnicodeError(UTF_ERR_LONG, pos-2, ch))
                end
            else
                # lead bytes 0xf5-0xff cannot start any valid sequence
                throw(UnicodeError(UTF_ERR_INVALID, pos, ch))
            end
        end
    end
    num3byte != 0 && (flags |= UTF_UNICODE3)
    num4byte != 0 && (flags |= UTF_UNICODE4)
    return totalchar, flags, num4byte, num3byte, num2byte
end
# UTF-16 / UTF-32 / `AbstractString` method of `unsafe_checkstring`.
#
# Iterates `dat` from `pos` through `endpos` (positions are not bounds checked),
# classifying each value by how many bytes it would need in UTF-8 and pairing
# up surrogates.  `accept_long_null` and `accept_long_char` are accepted for
# signature compatibility but are not consulted here.
#
# Returns `(total characters, flags, 4-byte, 3-byte, 2-byte)` and throws
# `UnicodeError` on values above 0x10ffff, unpaired or misordered surrogates,
# and (for inputs other than `Vector{UInt16}`) surrogate pairs when
# `accept_surrogates` is false.
function unsafe_checkstring{T <: Union{Vector{UInt16}, Vector{UInt32}, AbstractString}}(
                            dat::T,
                            pos = start(dat),
                            endpos = endof(dat)
                            ;
                            accept_long_null  = true,
                            accept_surrogates = true,
                            accept_long_char  = false)
    local unit::UInt32
    flags::UInt = 0
    nchars = 0
    n2 = 0
    n3 = 0
    n4 = 0
    @inbounds while pos <= endpos
        unit, pos = next(dat, pos)
        nchars += 1
        # ASCII needs no further bookkeeping
        unit <= 0x7f && continue
        if unit <= 0xff
            n2 += 1
            flags |= UTF_LATIN1
        elseif unit <= 0x7ff
            n2 += 1
            flags |= UTF_UNICODE2
        elseif unit > 0x0ffff
            if unit > 0x10ffff
                throw(UnicodeError(UTF_ERR_INVALID, pos, unit))
            end
            n4 += 1
        elseif is_surrogate_codeunit(unit)
            # a surrogate must be a lead that is immediately followed by a trail
            if !is_surrogate_lead(unit)
                throw(UnicodeError(UTF_ERR_NOT_LEAD, pos, unit))
            end
            if pos > endpos
                throw(UnicodeError(UTF_ERR_MISSING_SURROGATE, pos, unit))
            end
            unit, pos = next(dat, pos)
            if !is_surrogate_trail(unit)
                throw(UnicodeError(UTF_ERR_NOT_TRAIL, pos, unit))
            end
            n4 += 1
            # surrogate pairs are only flagged (or rejected) outside UTF-16 input
            if T != Vector{UInt16}
                accept_surrogates || throw(UnicodeError(UTF_ERR_SURROGATE, pos, unit))
                flags |= UTF_SURROGATE
            end
        else
            n3 += 1
        end
    end
    n3 == 0 || (flags |= UTF_UNICODE3)
    n4 == 0 || (flags |= UTF_UNICODE4)
    return nchars, flags, n4, n3, n2
end
"""
    checkstring(dat; kwargs...)
    checkstring(dat, startpos, endpos = endof(dat); kwargs...)

Validates and calculates number of characters in a UTF-8, UTF-16 or UTF-32 encoded vector/string

This function checks the bounds of the start and end positions.
Use `unsafe_checkstring` to avoid that overhead if the bounds have already been checked.

When no positions are given, the whole of `dat` is checked and no bounds
checks are performed.  When positions are given, both are passed to
`checkbounds` and `endpos` must not be less than `startpos`.  The keyword
arguments are forwarded unchanged to `unsafe_checkstring`.

Input Arguments:

* `dat`      UTF-8 (`Vector{UInt8}`), UTF-16 (`Vector{UInt16}`) or UTF-32 (`Vector{UInt32}`, `AbstractString`) encoded string

Optional Input Arguments:

* `startpos` start position (defaults to `start(dat)`)
* `endpos`   end position (defaults to `endof(dat)`)

Keyword Arguments:

* `accept_long_null`  = `true`  # Modified UTF-8 (`\\0` represented as `b\"\\xc0\\x80\"`)
* `accept_surrogates` = `true`  # `CESU-8`
* `accept_long_char`  = `false` # Accept arbitrary long encodings

Returns:

* (total characters, flags, 4-byte, 3-byte, 2-byte)

Throws:

* `UnicodeError`
* `ArgumentError` if `endpos` is less than `startpos`
* whatever `checkbounds` raises for an out-of-range `startpos` or `endpos`
"""
function checkstring end
# Whole-input form: the default start and end positions come from `dat`
# itself, so no bounds checking is performed before delegating.
function checkstring(dat; kwargs...)
    first_pos = start(dat)
    last_pos = endof(dat)
    return unsafe_checkstring(dat, first_pos, last_pos; kwargs...)
end
# Explicit-range form: validate the caller-supplied positions, then delegate.
# Both positions are bounds checked first; only afterwards is their ordering
# verified, so an out-of-range position is reported before an inverted range.
function checkstring(dat, startpos, endpos = endof(dat); kwargs...)
    checkbounds(dat, startpos)
    checkbounds(dat, endpos)
    if endpos < startpos
        throw(ArgumentError("End position ($endpos) is less than start position ($startpos)"))
    end
    return unsafe_checkstring(dat, startpos, endpos; kwargs...)
end
|