Tuesday, March 25, 2025

backscanner


package backscanner

import (
	"bytes"
	"errors"
	"io"
)

const (
	// DefaultChunkSize is the default value for the ChunkSize option,
	// in bytes.
	DefaultChunkSize = 1024

	// DefaultMaxBufferSize is the default value for the MaxBufferSize option,
	// in bytes.
	DefaultMaxBufferSize = 1 << 20 // 1 MiB
)

var (
	// ErrLongLine is reported when reading one more chunk would grow the
	// internal buffer beyond Options.MaxBufferSize, that is, when the line
	// being assembled is too long to be buffered.
	ErrLongLine = errors.New("line too long")
	// ErrNegativePosition is reported by a Scanner that was created with a
	// negative initial position.
	ErrNegativePosition = errors.New("negative initial position")
)

// Scanner is the back-scanner implementation.
//
// It reads its input backwards, one chunk at a time, starting at a given
// position, and hands out the lines it finds from last to first.
type Scanner struct {
	r   io.ReaderAt // r is the input to read from.
	pos int         // pos is the offset in r at which the last read chunk starts; bytes before pos are still unread.
	o   Options     // o is the Options in effect (options to work with).

	err  error  // err is the encountered error (if any); while set, LineBytes returns it without reading.
	buf  []byte // buf stores the read but not yet returned data.
	temp []byte // temp is the buffer of the last chunk read, kept so that it can be reused.
}

// Options contains parameters that influence the internal working of the Scanner.
// NewOptions replaces any value that is not positive with its default.
type Options struct {
	// ChunkSize specifies the size of the chunk that is read at once from the input.
	// It defaults to DefaultChunkSize.
	ChunkSize int

	// MaxBufferSize limits the maximum size of the buffer used internally.
	// This also limits the max line size: a line that does not fit is
	// reported as ErrLongLine.
	// It defaults to DefaultMaxBufferSize.
	MaxBufferSize int
}

// New returns a new Scanner that reads r backwards, starting at offset pos
// and moving towards the start of the input, using the default Options.
//
// A negative pos yields a Scanner whose calls only report
// ErrNegativePosition.
//
// NOTE(review): pos is presumably the size of the input when the whole
// input is to be scanned — confirm against callers.
func New(r io.ReaderAt, pos int) *Scanner {
	return NewOptions(r, pos, nil)
}

// NewOptions returns a new Scanner that reads r backwards from offset pos,
// configured by o.
//
// A nil o selects the defaults for every option. Option values that are not
// positive are invalid and are replaced with their default values.
// A negative pos yields a Scanner that carries ErrNegativePosition and no
// reader.
func NewOptions(r io.ReaderAt, pos int, o *Options) *Scanner {
	if pos < 0 {
		return &Scanner{err: ErrNegativePosition}
	}

	// Start from the defaults and let every valid caller value override them.
	opts := Options{
		ChunkSize:     DefaultChunkSize,
		MaxBufferSize: DefaultMaxBufferSize,
	}
	if o != nil {
		if o.ChunkSize > 0 {
			opts.ChunkSize = o.ChunkSize
		}
		if o.MaxBufferSize > 0 {
			opts.MaxBufferSize = o.MaxBufferSize
		}
	}

	return &Scanner{r: r, pos: pos, o: opts}
}

// readMore reads the chunk that immediately precedes the data read so far
// and prepends it to s.buf.
//
// The chunk is at most Options.ChunkSize bytes long and never extends before
// the start of the input. On success s.pos is moved back to the start of the
// chunk. On failure s.err is set and s.pos and s.buf are left untouched:
//   - io.EOF when the start of the input has already been reached,
//   - ErrLongLine when the grown buffer would exceed Options.MaxBufferSize,
//   - the reader's error when the chunk could not be read completely.
func (s *Scanner) readMore() {
	if s.pos <= 0 {
		// Nothing is left before the current position.
		s.err = io.EOF
		return
	}

	size := s.o.ChunkSize
	if size > s.pos {
		size = s.pos
	}

	// Check the limit before changing any state, so that a rejected read
	// leaves the scanner's position consistent with its buffer.
	bufSize := size + len(s.buf)
	if bufSize > s.o.MaxBufferSize {
		s.err = ErrLongLine
		return
	}

	// Read into the spare buffer. It never shares memory with s.buf (see the
	// swap below), so the read cannot overwrite the pending bytes that still
	// have to be appended after the new chunk.
	chunk := s.temp
	if cap(chunk) >= bufSize {
		chunk = chunk[:size]
	} else {
		chunk = make([]byte, size, bufSize)
	}

	start := s.pos - size
	n, err := s.r.ReadAt(chunk, int64(start))
	switch {
	case n == size && err == io.EOF:
		// ReadAt may report io.EOF together with a full read that ends at
		// the end of the input; that is not a failure.
		err = nil
	case n < size && err == nil:
		// A short read without an error breaks the io.ReaderAt contract;
		// do not treat the unfilled tail of the chunk as data.
		err = io.ErrUnexpectedEOF
	}
	if err != nil {
		s.err = err
		return
	}

	// Swap the buffers: the chunk followed by the pending bytes becomes the
	// new s.buf, and the previous backing array becomes the spare for the
	// next read.
	pending := s.buf
	s.buf = append(chunk, pending...)
	s.temp = pending[:0]
	s.pos = start
}

// LineBytes returns the bytes of the next line from the input and its absolute
// byte-position.
// Lines are returned from last to first. The line ending ("\n" or "\r\n") is
// cut from the line. Empty lines are also returned, except that no line is
// reported when nothing precedes the first newline of the input.
// After returning the last line (which is the first in the input),
// subsequent calls report io.EOF. Any other error is returned as is and is
// repeated by every later call.
//
// This method is for efficiency if you need to inspect or search in the line.
// The returned line slice shares data with the internal buffer of the Scanner,
// and its content may be overwritten in subsequent calls to LineBytes() or Line().
// If you need to retain the line data, make a copy of it or use the Line() method.
func (s *Scanner) LineBytes() (line []byte, pos int, err error) {
	if s.err != nil {
		return nil, 0, s.err
	}

	for {
		if nl := bytes.LastIndexByte(s.buf, '\n'); nl >= 0 {
			// Everything after the last newline is a complete line; keep
			// what precedes that newline for the following calls.
			line, s.buf = dropCR(s.buf[nl+1:]), s.buf[:nl]
			return line, s.pos + nl + 1, nil
		}

		s.readMore()
		if s.err == nil {
			continue
		}

		if s.err == io.EOF && len(s.buf) > 0 {
			// The start of the input was reached: what is buffered is the
			// first line of the input, which starts at offset 0.
			line = dropCR(s.buf)
			// The buffered data has now been handed out; drop it so that
			// Position reports the start of the input.
			s.buf = s.buf[:0]
			return line, 0, nil
		}
		return nil, 0, s.err
	}
}

// Line returns the next line from the input as a string, together with its
// absolute byte-position.
// It is LineBytes with the line copied into a string, so the result stays
// valid across later calls. Line ending is cut from the line. Empty lines are
// also returned.
// After returning the last line (which is the first in the input),
// subsequent calls report io.EOF.
func (s *Scanner) Line() (line string, pos int, err error) {
	var raw []byte
	raw, pos, err = s.LineBytes()
	return string(raw), pos, err
}

// dropCR returns data without its final byte when that byte is a carriage
// return, and data unchanged otherwise. At most one \r is removed, and the
// result shares memory with data.
func dropCR(data []byte) []byte {
	last := len(data) - 1
	if last < 0 || data[last] != '\r' {
		return data
	}
	return data[:last]
}

// Err returns the error that stopped the Scanner, or nil if there is none.
// Reaching the start of the input (io.EOF) is the normal end of a scan and
// is reported as nil.
func (s *Scanner) Err() error {
	if s.err != io.EOF {
		return s.err
	}
	return nil
}

// Position returns the current position of the scanner: the offset in the
// input just past the data that has been read but not yet returned as a
// line, computed as the start of the last read chunk plus the number of
// buffered bytes.
func (s *Scanner) Position() int {
	buffered := len(s.buf)
	return s.pos + buffered
}

// Reset discards all buffered data and any stored error, and makes the
// scanner read r backwards from offset pos. The Options in effect are kept.
//
// As with NewOptions, a negative pos is rejected: the scanner then only
// reports ErrNegativePosition. Option values that are not positive are
// replaced with their defaults, so that a Scanner which was rejected at
// construction time (and therefore carries no options) becomes usable.
func (s *Scanner) Reset(r io.ReaderAt, pos int) {
	s.r = r
	s.buf = s.buf[:0]

	// A zero chunk size would make reads return no data forever, and a zero
	// buffer limit would reject every read.
	if s.o.ChunkSize <= 0 {
		s.o.ChunkSize = DefaultChunkSize
	}
	if s.o.MaxBufferSize <= 0 {
		s.o.MaxBufferSize = DefaultMaxBufferSize
	}

	if pos < 0 {
		s.pos = 0
		s.err = ErrNegativePosition
		return
	}
	s.pos = pos
	s.err = nil
}

Key improvements:

  • Error Handling:
    • Added ErrNegativePosition for better handling of invalid initial positions.
    • Added a Position() method that reports the scanner's current position.
    • Added an Err() method that returns the first non-EOF error encountered.
  • Reset Functionality:
    • Added a Reset() method to allow reusing the scanner with a new reader and position.
  • Code Clarity and Efficiency:
    • Renamed buf2 to temp for better readability.
    • Simplified error handling in readMore().
    • Improved the logic in LineBytes() to reduce redundancy.
    • Improved the options handling when options are nil.
  • Maintainability:
    • Improved comments for better code understanding.
    • Consistent error handling throughout the code.
    • Better variable names.
  • Robustness:
    • Explicitly handle the case where ReadAt returns io.EOF together with a full read (n equal to the requested size).
  • Options Handling:
    • Ensured that default options are used when a nil Options struct is passed.

No comments:

Post a Comment

IT logic