/usr/share/go-1.7/src/bufio/scan.go is in golang-1.7-src 1.7.4-2.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 | // Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package bufio
import (
"bytes"
"errors"
"io"
"unicode/utf8"
)
// Scanner provides a convenient interface for reading data such as
// a file of newline-delimited lines of text. Successive calls to
// the Scan method will step through the 'tokens' of a file, skipping
// the bytes between the tokens. The specification of a token is
// defined by a split function of type SplitFunc; the default split
// function breaks the input into lines with line termination stripped. Split
// functions are defined in this package for scanning a file into
// lines, bytes, UTF-8-encoded runes, and space-delimited words. The
// client may instead provide a custom split function.
//
// Scanning stops unrecoverably at EOF, the first I/O error, or a token too
// large to fit in the buffer. When a scan stops, the reader may have
// advanced arbitrarily far past the last token. Programs that need more
// control over error handling or large tokens, or must run sequential scans
// on a reader, should use bufio.Reader instead.
//
type Scanner struct {
r io.Reader // The reader provided by the client.
split SplitFunc // The function to split the tokens.
maxTokenSize int // Maximum size of a token; modified by tests.
token []byte // Last token returned by split.
buf []byte // Buffer used as argument to split.
start int // First non-processed byte in buf.
end int // End of data in buf.
err error // Sticky error.
empties int // Count of successive empty tokens.
scanCalled bool // Scan has been called; buffer is in use.
done bool // Scan has finished.
}
// SplitFunc is the signature of the split function used to tokenize the
// input. The arguments are an initial substring of the remaining unprocessed
// data and a flag, atEOF, that reports whether the Reader has no more data
// to give. The return values are the number of bytes to advance the input
// and the next token to return to the user, plus an error, if any. If the
// data does not yet hold a complete token, for instance if it has no newline
// while scanning lines, SplitFunc can return (0, nil, nil) to signal the
// Scanner to read more data into the slice and try again with a longer slice
// starting at the same point in the input.
//
// If the returned error is non-nil, scanning stops and the error
// is returned to the client.
//
// The function is never called with an empty data slice unless atEOF
// is true. If atEOF is true, however, data may be non-empty and,
// as always, holds unprocessed text.
type SplitFunc func(data []byte, atEOF bool) (advance int, token []byte, err error)
// Errors returned by Scanner.
var (
ErrTooLong = errors.New("bufio.Scanner: token too long")
ErrNegativeAdvance = errors.New("bufio.Scanner: SplitFunc returns negative advance count")
ErrAdvanceTooFar = errors.New("bufio.Scanner: SplitFunc returns advance count beyond input")
)
const (
// MaxScanTokenSize is the maximum size used to buffer a token
// unless the user provides an explicit buffer with Scan.Buffer.
// The actual maximum token size may be smaller as the buffer
// may need to include, for instance, a newline.
MaxScanTokenSize = 64 * 1024
startBufSize = 4096 // Size of initial allocation for buffer.
)
// NewScanner returns a new Scanner to read from r.
// The split function defaults to ScanLines.
func NewScanner(r io.Reader) *Scanner {
return &Scanner{
r: r,
split: ScanLines,
maxTokenSize: MaxScanTokenSize,
}
}
// Err returns the first non-EOF error that was encountered by the Scanner.
func (s *Scanner) Err() error {
if s.err == io.EOF {
return nil
}
return s.err
}
// Bytes returns the most recent token generated by a call to Scan.
// The underlying array may point to data that will be overwritten
// by a subsequent call to Scan. It does no allocation.
func (s *Scanner) Bytes() []byte {
return s.token
}
// Text returns the most recent token generated by a call to Scan
// as a newly allocated string holding its bytes.
func (s *Scanner) Text() string {
return string(s.token)
}
// ErrFinalToken is a special sentinel error value. It is intended to be
// returned by a Split function to indicate that the token being delivered
// with the error is the last token and scanning should stop after this one.
// After ErrFinalToken is received by Scan, scanning stops with no error.
// The value is useful to stop processing early or when it is necessary to
// deliver a final empty token. One could achieve the same behavior
// with a custom error value but providing one here is tidier.
// See the emptyFinalToken example for a use of this value.
var ErrFinalToken = errors.New("final token")
// Scan advances the Scanner to the next token, which will then be
// available through the Bytes or Text method. It returns false when the
// scan stops, either by reaching the end of the input or an error.
// After Scan returns false, the Err method will return any error that
// occurred during scanning, except that if it was io.EOF, Err
// will return nil.
// Scan panics if the split function returns 100 empty tokens without
// advancing the input. This is a common error mode for scanners.
func (s *Scanner) Scan() bool {
if s.done {
return false
}
s.scanCalled = true
// Loop until we have a token.
for {
// See if we can get a token with what we already have.
// If we've run out of data but have an error, give the split function
// a chance to recover any remaining, possibly empty token.
if s.end > s.start || s.err != nil {
advance, token, err := s.split(s.buf[s.start:s.end], s.err != nil)
if err != nil {
if err == ErrFinalToken {
s.token = token
s.done = true
return true
}
s.setErr(err)
return false
}
if !s.advance(advance) {
return false
}
s.token = token
if token != nil {
if s.err == nil || advance > 0 {
s.empties = 0
} else {
// Returning tokens not advancing input at EOF.
s.empties++
if s.empties > 100 {
panic("bufio.Scan: 100 empty tokens without progressing")
}
}
return true
}
}
// We cannot generate a token with what we are holding.
// If we've already hit EOF or an I/O error, we are done.
if s.err != nil {
// Shut it down.
s.start = 0
s.end = 0
return false
}
// Must read more data.
// First, shift data to beginning of buffer if there's lots of empty space
// or space is needed.
if s.start > 0 && (s.end == len(s.buf) || s.start > len(s.buf)/2) {
copy(s.buf, s.buf[s.start:s.end])
s.end -= s.start
s.start = 0
}
// Is the buffer full? If so, resize.
if s.end == len(s.buf) {
// Guarantee no overflow in the multiplication below.
const maxInt = int(^uint(0) >> 1)
if len(s.buf) >= s.maxTokenSize || len(s.buf) > maxInt/2 {
s.setErr(ErrTooLong)
return false
}
newSize := len(s.buf) * 2
if newSize == 0 {
newSize = startBufSize
}
if newSize > s.maxTokenSize {
newSize = s.maxTokenSize
}
newBuf := make([]byte, newSize)
copy(newBuf, s.buf[s.start:s.end])
s.buf = newBuf
s.end -= s.start
s.start = 0
continue
}
// Finally we can read some input. Make sure we don't get stuck with
// a misbehaving Reader. Officially we don't need to do this, but let's
// be extra careful: Scanner is for safe, simple jobs.
for loop := 0; ; {
n, err := s.r.Read(s.buf[s.end:len(s.buf)])
s.end += n
if err != nil {
s.setErr(err)
break
}
if n > 0 {
s.empties = 0
break
}
loop++
if loop > maxConsecutiveEmptyReads {
s.setErr(io.ErrNoProgress)
break
}
}
}
}
// advance consumes n bytes of the buffer. It reports whether the advance was legal.
func (s *Scanner) advance(n int) bool {
if n < 0 {
s.setErr(ErrNegativeAdvance)
return false
}
if n > s.end-s.start {
s.setErr(ErrAdvanceTooFar)
return false
}
s.start += n
return true
}
// setErr records the first error encountered.
func (s *Scanner) setErr(err error) {
if s.err == nil || s.err == io.EOF {
s.err = err
}
}
// Buffer sets the initial buffer to use when scanning and the maximum
// size of buffer that may be allocated during scanning. The maximum
// token size is the larger of max and cap(buf). If max <= cap(buf),
// Scan will use this buffer only and do no allocation.
//
// By default, Scan uses an internal buffer and sets the
// maximum token size to MaxScanTokenSize.
//
// Buffer panics if it is called after scanning has started.
func (s *Scanner) Buffer(buf []byte, max int) {
if s.scanCalled {
panic("Buffer called after Scan")
}
s.buf = buf[0:cap(buf)]
s.maxTokenSize = max
}
// Split sets the split function for the Scanner.
// The default split function is ScanLines.
//
// Split panics if it is called after scanning has started.
func (s *Scanner) Split(split SplitFunc) {
if s.scanCalled {
panic("Split called after Scan")
}
s.split = split
}
// Split functions
// ScanBytes is a split function for a Scanner that returns each byte as a token.
func ScanBytes(data []byte, atEOF bool) (advance int, token []byte, err error) {
if atEOF && len(data) == 0 {
return 0, nil, nil
}
return 1, data[0:1], nil
}
var errorRune = []byte(string(utf8.RuneError))
// ScanRunes is a split function for a Scanner that returns each
// UTF-8-encoded rune as a token. The sequence of runes returned is
// equivalent to that from a range loop over the input as a string, which
// means that erroneous UTF-8 encodings translate to U+FFFD = "\xef\xbf\xbd".
// Because of the Scan interface, this makes it impossible for the client to
// distinguish correctly encoded replacement runes from encoding errors.
func ScanRunes(data []byte, atEOF bool) (advance int, token []byte, err error) {
if atEOF && len(data) == 0 {
return 0, nil, nil
}
// Fast path 1: ASCII.
if data[0] < utf8.RuneSelf {
return 1, data[0:1], nil
}
// Fast path 2: Correct UTF-8 decode without error.
_, width := utf8.DecodeRune(data)
if width > 1 {
// It's a valid encoding. Width cannot be one for a correctly encoded
// non-ASCII rune.
return width, data[0:width], nil
}
// We know it's an error: we have width==1 and implicitly r==utf8.RuneError.
// Is the error because there wasn't a full rune to be decoded?
// FullRune distinguishes correctly between erroneous and incomplete encodings.
if !atEOF && !utf8.FullRune(data) {
// Incomplete; get more bytes.
return 0, nil, nil
}
// We have a real UTF-8 encoding error. Return a properly encoded error rune
// but advance only one byte. This matches the behavior of a range loop over
// an incorrectly encoded string.
return 1, errorRune, nil
}
// dropCR drops a terminal \r from the data.
func dropCR(data []byte) []byte {
if len(data) > 0 && data[len(data)-1] == '\r' {
return data[0 : len(data)-1]
}
return data
}
// ScanLines is a split function for a Scanner that returns each line of
// text, stripped of any trailing end-of-line marker. The returned line may
// be empty. The end-of-line marker is one optional carriage return followed
// by one mandatory newline. In regular expression notation, it is `\r?\n`.
// The last non-empty line of input will be returned even if it has no
// newline.
func ScanLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
if atEOF && len(data) == 0 {
return 0, nil, nil
}
if i := bytes.IndexByte(data, '\n'); i >= 0 {
// We have a full newline-terminated line.
return i + 1, dropCR(data[0:i]), nil
}
// If we're at EOF, we have a final, non-terminated line. Return it.
if atEOF {
return len(data), dropCR(data), nil
}
// Request more data.
return 0, nil, nil
}
// isSpace reports whether the character is a Unicode white space character.
// We avoid dependency on the unicode package, but check validity of the implementation
// in the tests.
func isSpace(r rune) bool {
if r <= '\u00FF' {
// Obvious ASCII ones: \t through \r plus space. Plus two Latin-1 oddballs.
switch r {
case ' ', '\t', '\n', '\v', '\f', '\r':
return true
case '\u0085', '\u00A0':
return true
}
return false
}
// High-valued ones.
if '\u2000' <= r && r <= '\u200a' {
return true
}
switch r {
case '\u1680', '\u2028', '\u2029', '\u202f', '\u205f', '\u3000':
return true
}
return false
}
// ScanWords is a split function for a Scanner that returns each
// space-separated word of text, with surrounding spaces deleted. It will
// never return an empty string. The definition of space is set by
// unicode.IsSpace.
func ScanWords(data []byte, atEOF bool) (advance int, token []byte, err error) {
// Skip leading spaces.
start := 0
for width := 0; start < len(data); start += width {
var r rune
r, width = utf8.DecodeRune(data[start:])
if !isSpace(r) {
break
}
}
// Scan until space, marking end of word.
for width, i := 0, start; i < len(data); i += width {
var r rune
r, width = utf8.DecodeRune(data[i:])
if isSpace(r) {
return i + width, data[start:i], nil
}
}
// If we're at EOF, we have a final, non-empty, non-terminated word. Return it.
if atEOF && len(data) > start {
return len(data), data[start:], nil
}
// Request more data.
return start, nil, nil
}
|