Skip to content

Commit

Permalink
use StringScanner with baseparser
Browse files Browse the repository at this point in the history
[Why]
Using StringScanner reduces string copying and speeds up parsing.
  • Loading branch information
naitoh committed Jan 6, 2024
1 parent 724ed57 commit 09b7fb9
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 50 deletions.
2 changes: 1 addition & 1 deletion lib/rexml/parseexception.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def to_s
err << "\nLine: #{line}\n"
err << "Position: #{position}\n"
err << "Last 80 unconsumed characters:\n"
err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ')
err << @source.scanner.rest[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ')
end

err
Expand Down
35 changes: 15 additions & 20 deletions lib/rexml/parsers/baseparser.rb
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ class BaseParser
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um

NOTATIONDECL_START = /\A\s*<!NOTATION/um
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
Expand Down Expand Up @@ -194,8 +194,6 @@ def pull_event
end
return [ :end_document ] if empty?
return @stack.shift if @stack.size > 0
#STDERR.puts @source.encoding
#STDERR.puts "BUFFER = #{@source.buffer.inspect}"
if @document_status == nil
word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
word = word[1] unless word.nil?
Expand Down Expand Up @@ -259,7 +257,7 @@ def pull_event
else
@document_status = :after_doctype
if @source.encoding == "UTF-8"
@source.buffer.force_encoding(::Encoding::UTF_8)
@source.scanner.string = @source.scanner.rest.force_encoding(::Encoding::UTF_8)
end
end
end
Expand All @@ -274,8 +272,8 @@ def pull_event
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

when ENTITY_START
match = @source.match( ENTITYDECL, true ).to_a.compact
match[0] = :entitydecl
match = @source.match( ENTITYDECL, true )
match = match.nil? ? [:entitydecl] : [:entitydecl, *match.captures.compact.reject(&:empty?)]
ref = false
if match[1] == '%'
ref = true
Expand Down Expand Up @@ -349,9 +347,9 @@ def pull_event
@source.match(/\A\s*/um, true)
end
begin
@source.read if @source.buffer.size<2
if @source.buffer[0] == ?<
if @source.buffer[1] == ?/
@source.read if @source.scanner.rest.size<2
if @source.scanner.rest[0] == ?<
if @source.scanner.rest[1] == ?/
@nsstack.shift
last_tag = @tags.pop
md = @source.match( CLOSE_MATCH, true )
Expand All @@ -365,9 +363,8 @@ def pull_event
raise REXML::ParseException.new(message, @source)
end
return [ :end_element, last_tag ]
elsif @source.buffer[1] == ?!
elsif @source.scanner.rest[1] == ?!
md = @source.match(/\A(\s*[^>]*>)/um)
#STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
raise REXML::ParseException.new("Malformed node", @source) unless md
if md[0][2] == ?-
md = @source.match( COMMENT_PATTERN, true )
Expand All @@ -384,14 +381,15 @@ def pull_event
end
raise REXML::ParseException.new( "Declarations can only occur "+
"in the doctype declaration.", @source)
elsif @source.buffer[1] == ??
elsif @source.scanner.rest[1] == ??
return process_instruction
else
# Get the next tag
md = @source.match(TAG_MATCH, true)
unless md
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
end
tag = md[1]
@document_status = :in_element
prefixes = Set.new
prefixes << md[2] if md[2]
Expand All @@ -405,23 +403,20 @@ def pull_event
end

if closed
@closed = md[1]
@closed = tag
@nsstack.shift
else
@tags.push( md[1] )
@tags.push( tag )
end
return [ :start_element, md[1], attributes ]
return [ :start_element, tag, attributes ]
end
else
md = @source.match( TEXT_PATTERN, true )
text = md[1]
if md[0].length == 0
@source.match( /(\s+)/, true )
end
#STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
#return [ :text, "" ] if md[0].length == 0
# unnormalized = Text::unnormalize( md[1], self )
# return PullEvent.new( :text, md[1], unnormalized )
return [ :text, md[1] ]
return [ :text, text ]
end
rescue REXML::UndefinedNamespaceException
raise
Expand Down
69 changes: 40 additions & 29 deletions lib/rexml/source.rb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def SourceFactory::create_from(arg)
class Source
include Encoding
# The current buffer (what we're going to read next)
attr_reader :buffer
attr_reader :scanner

# The line number of the last consumed text
attr_reader :line
attr_reader :encoding
Expand All @@ -41,7 +42,8 @@ class Source
# @param encoding if non-null, sets the encoding of the source to this
# value, overriding all encoding detection
def initialize(arg, encoding=nil)
@orig = @buffer = arg
@orig = arg
@scanner = StringScanner.new(@orig)
if encoding
self.encoding = encoding
else
Expand All @@ -62,53 +64,56 @@ def read
end

def match(pattern, cons=false)
md = pattern.match(@buffer)
@buffer = $' if cons and md
return md
if cons
@scanner.scan(pattern)
else
@scanner.check(pattern)
end
@scanner.matched? ? @scanner : nil
end

# @return true if the Source is exhausted
def empty?
@buffer == ""
@scanner.eos?
end

# @return the current line in the source
def current_line
lines = @orig.split
res = lines.grep @buffer[0..30]
res = lines.grep @scanner.rest[0..30]
res = res[-1] if res.kind_of? Array
lines.index( res ) if res
end

private
def detect_encoding
buffer_encoding = @buffer.encoding
orig_encoding = @orig.encoding
detected_encoding = "UTF-8"
begin
@buffer.force_encoding("ASCII-8BIT")
if @buffer[0, 2] == "\xfe\xff"
@buffer[0, 2] = ""
@orig.force_encoding("ASCII-8BIT")
if @orig[0, 2] == "\xfe\xff"
@orig[0, 2] = ""
detected_encoding = "UTF-16BE"
elsif @buffer[0, 2] == "\xff\xfe"
@buffer[0, 2] = ""
elsif @orig[0, 2] == "\xff\xfe"
@orig[0, 2] = ""
detected_encoding = "UTF-16LE"
elsif @buffer[0, 3] == "\xef\xbb\xbf"
@buffer[0, 3] = ""
elsif @orig[0, 3] == "\xef\xbb\xbf"
@orig[0, 3] = ""
detected_encoding = "UTF-8"
end
ensure
@buffer.force_encoding(buffer_encoding)
@orig.force_encoding(orig_encoding)
end
self.encoding = detected_encoding
end

def encoding_updated
if @encoding != 'UTF-8'
@buffer = decode(@buffer)
@scanner.string = decode(@scanner.rest)
@to_utf = true
else
@to_utf = false
@buffer.force_encoding ::Encoding::UTF_8
@scanner.string = @scanner.rest.force_encoding(::Encoding::UTF_8)
end
end
end
Expand All @@ -131,7 +136,7 @@ def initialize(arg, block_size=500, encoding=nil)
end

if !@to_utf and
@buffer.respond_to?(:force_encoding) and
@orig.respond_to?(:force_encoding) and
@source.respond_to?(:external_encoding) and
@source.external_encoding != ::Encoding::UTF_8
@force_utf8 = true
Expand All @@ -142,26 +147,32 @@ def initialize(arg, block_size=500, encoding=nil)

def read
begin
@buffer << readline
@scanner << readline
rescue Exception, NameError
@source = nil
end
end

def match( pattern, cons=false )
rv = pattern.match(@buffer)
@buffer = $' if cons and rv
while !rv and @source
if cons
@scanner.scan(pattern)
else
@scanner.check(pattern)
end
while !@scanner.matched? and @source
begin
@buffer << readline
rv = pattern.match(@buffer)
@buffer = $' if cons and rv
@scanner << readline
if cons
@scanner.scan(pattern)
else
@scanner.check(pattern)
end
rescue
@source = nil
end
end
rv.taint if RUBY_VERSION < '2.7'
rv
@scanner.taint if RUBY_VERSION < '2.7'
@scanner.matched? ? @scanner : nil
end

def empty?
Expand Down Expand Up @@ -218,7 +229,7 @@ def encoding_updated
@source.set_encoding(@encoding, @encoding)
end
@line_break = encode(">")
@pending_buffer, @buffer = @buffer, ""
@pending_buffer, @scanner.string = @scanner.rest, ""
@pending_buffer.force_encoding(@encoding)
super
end
Expand Down

0 comments on commit 09b7fb9

Please sign in to comment.