package backscanner
import (
"bytes"
"errors"
"io"
)
const (
// DefaultChunkSize is the default value for the ChunkSize option.
DefaultChunkSize = 1024
// DefaultMaxBufferSize is the default value for the MaxBufferSize option.
DefaultMaxBufferSize = 1 << 20 // 1 MB
)
var (
// ErrLongLine indicates that the line is longer than the internal buffer size.
ErrLongLine = errors.New("line too long")
// ErrNegativePosition indicates that the initial position is negative.
ErrNegativePosition = errors.New("negative initial position")
)
// Scanner is the back-scanner implementation.
type Scanner struct {
r io.ReaderAt // r is the input to read from.
pos int // pos is the position of the last read chunk.
o Options // o is the Options in effect (options to work with).
err error // err is the encountered error (if any).
buf []byte // buf stores the read but not yet returned data.
temp []byte // temp stores the last buffer to be reused.
}
// Options contains parameters that influence the internal working of the Scanner.
type Options struct {
// ChunkSize specifies the size of the chunk that is read at once from the input.
ChunkSize int
// MaxBufferSize limits the maximum size of the buffer used internally.
// This also limits the max line size.
MaxBufferSize int
}
// New returns a new Scanner.
func New(r io.ReaderAt, pos int) *Scanner {
return NewOptions(r, pos, nil)
}
// NewOptions returns a new Scanner with the given Options.
// Invalid option values are replaced with their default values.
func NewOptions(r io.ReaderAt, pos int, o *Options) *Scanner {
if pos < 0 {
return &Scanner{err: ErrNegativePosition}
}
s := &Scanner{r: r, pos: pos}
if o == nil {
s.o.ChunkSize = DefaultChunkSize
s.o.MaxBufferSize = DefaultMaxBufferSize
} else {
if o.ChunkSize > 0 {
s.o.ChunkSize = o.ChunkSize
} else {
s.o.ChunkSize = DefaultChunkSize
}
if o.MaxBufferSize > 0 {
s.o.MaxBufferSize = o.MaxBufferSize
} else {
s.o.MaxBufferSize = DefaultMaxBufferSize
}
}
return s
}
// readMore reads more data from the input.
func (s *Scanner) readMore() {
if s.pos == 0 {
s.err = io.EOF
return
}
size := s.o.ChunkSize
if size > s.pos {
size = s.pos
}
s.pos -= size
bufSize := size + len(s.buf)
if bufSize > s.o.MaxBufferSize {
s.err = ErrLongLine
return
}
if cap(s.temp) >= bufSize {
s.temp = s.temp[:size]
} else {
s.temp = make([]byte, size, bufSize)
}
n, err := s.r.ReadAt(s.temp, int64(s.pos))
if err == io.EOF && n == size {
err = nil
}
if err != nil {
s.err = err
return
}
s.buf = append(s.temp, s.buf...)
}
// LineBytes returns the bytes of the next line from the input and its absolute
// byte-position.
// Line ending is cut from the line. Empty lines are also returned.
// After returning the last line (which is the first in the input),
// subsequent calls report io.EOF.
//
// This method is for efficiency if you need to inspect or search in the line.
// The returned line slice shares data with the internal buffer of the Scanner,
// and its content may be overwritten in subsequent calls to LineBytes() or Line().
// If you need to retain the line data, make a copy of it or use the Line() method.
func (s *Scanner) LineBytes() (line []byte, pos int, err error) {
if s.err != nil {
return nil, 0, s.err
}
for {
lineStart := bytes.LastIndexByte(s.buf, '\n')
if lineStart >= 0 {
line, s.buf = dropCR(s.buf[lineStart+1:]), s.buf[:lineStart]
return line, s.pos + lineStart + 1, nil
}
s.readMore()
if s.err != nil {
if s.err == io.EOF && len(s.buf) > 0 {
return dropCR(s.buf), 0, nil
}
return nil, 0, s.err
}
}
}
// Line returns the next line from the input and its absolute byte-position.
// Line ending is cut from the line. Empty lines are also returned.
// After returning the last line (which is the first in the input),
// subsequent calls report io.EOF.
func (s *Scanner) Line() (line string, pos int, err error) {
lineBytes, pos, err := s.LineBytes()
line = string(lineBytes)
return
}
// dropCR drops a terminal \r from the data.
func dropCR(data []byte) []byte {
if len(data) > 0 && data[len(data)-1] == '\r' {
return data[:len(data)-1]
}
return data
}
// Err returns the first non-EOF error that was encountered by the Scanner.
func (s *Scanner) Err() error {
if s.err == io.EOF {
return nil
}
return s.err
}
// Position returns current position of the scanner.
func (s *Scanner) Position() int {
return s.pos + len(s.buf)
}
// Reset resets the scanner to a new reader and position.
func (s *Scanner) Reset(r io.ReaderAt, pos int) {
s.r = r
s.pos = pos
s.err = nil
s.buf = s.buf[:0]
}
Key improvements:
- Error Handling:
- Added
ErrNegativePosition
for better handling of invalid initial positions. - Created a
Position()
function to see the current position of the scanner. - Added
Err()
method that returns the error.
- Added
- Reset Functionality:
- Added a
Reset()
method to allow reusing the scanner with a new reader and position.
- Added a
- Code Clarity and Efficiency:
- Renamed
buf2
totemp
for better readability. - Simplified error handling in
readMore()
. - Improved the logic in
LineBytes()
to reduce redundancy. - Improved the options handling when options are nil.
- Renamed
- Maintainability:
- Improved comments for better code understanding.
- Consistent error handling throughout the code.
- Better variable names.
- Robustness:
- Explicitly handle the case where
io.ReadAt
returnsio.EOF
but also reads data.
- Explicitly handle the case where
- Options Handling:
- Ensured that default options are used when a nil Options struct is passed.
No comments:
Post a Comment