Module: RDF::RDFa::Reader::Nokogiri

Defined in:
vendor/bundler/ruby/2.5.0/bundler/gems/rdf-rdfa-64ca6c8311da/lib/rdf/rdfa/reader/nokogiri.rb

Overview

Nokogiri implementation of an XML parser.

Defined Under Namespace

Classes: NodeProxy, NodeSetProxy

Class Method Summary collapse

Instance Method Summary collapse

Class Method Details

.library ⇒ Symbol

Returns the name of the underlying XML library.

Returns:

  • (Symbol) — :nokogiri



12
13
14
# File 'vendor/bundler/ruby/2.5.0/bundler/gems/rdf-rdfa-64ca6c8311da/lib/rdf/rdfa/reader/nokogiri.rb', line 12

# Identifies the XML library that backs this reader implementation.
#
# @return [Symbol] always +:nokogiri+
def self.library
  :nokogiri
end

Instance Method Details

#detect_host_language_version(input, options) ⇒ Object

Determine the host language and/or version from options and the input document



194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
# File 'vendor/bundler/ruby/2.5.0/bundler/gems/rdf-rdfa-64ca6c8311da/lib/rdf/rdfa/reader/nokogiri.rb', line 194

# Determine the host language and/or RDFa version from options and the input document.
#
# Explicit +:host_language+ / +:version+ options always win. Whatever is still
# missing is sniffed from the DOCTYPE, the root element (its name and +version+
# attribute) and the content type. Results are stored in +@host_language+ and
# +@version+; +@version+ falls back to +:"rdfa1.1"+ and +@host_language+ to +:html5+.
#
# @param [Nokogiri::XML::Document, Nokogiri::HTML::Document, #read, #to_s] input
#   an already-parsed document, or raw markup (IO-like or string-like)
# @param [Hash{Symbol => Object}] options
# @option options [#to_sym] :host_language forces the host language
# @option options [#to_sym] :version forces the RDFa version
# @option options [String] :encoding
#   written as a side effect when raw markup declares a charset in a +<meta>+ element
# @return [Symbol, nil] the host language in effect, or +nil+ when both values
#   were supplied through +options+
def detect_host_language_version(input, options)
  @host_language = options[:host_language] ? options[:host_language].to_sym : nil
  @version = options[:version] ? options[:version].to_sym : nil
  return if @host_language && @version

  # Sniff version based on input
  case input
  when ::Nokogiri::XML::Document, ::Nokogiri::HTML::Document
    # Serialize the DTD node so the regexp tests below see the DOCTYPE text
    # (nil.to_s is "" when the document has no DTD).
    # NOTE(review): assumes DTD#to_s yields the <!DOCTYPE ...> declaration — confirm.
    doc_type_string = input.children.detect {|c| c.is_a?(::Nokogiri::XML::DTD)}.to_s

    # A document may have no root element at all; treat that as "unknown".
    root_node = input.root
    version_attr = root_node ? root_node.attribute("version").to_s : ""
    root_element = root_node ? root_node.name.downcase : ""

    # HTML::Document is tested explicitly; anything else reaching this branch
    # is an XML::Document, i.e. XHTML when the root is <html>.
    content_type = if root_element == "html"
      input.is_a?(::Nokogiri::HTML::Document) ? "text/html" : "application/xhtml+xml"
    end
  else
    content_type = input.content_type if input.respond_to?(:content_type)

    # Determine from head of document
    head = if input.respond_to?(:read)
      input.rewind
      string = input.read(1000)
      input.rewind
      string.to_s
    else
      input.to_s[0..1000]
    end

    doc_type_string = head.match(%r(<!DOCTYPE[^>]*>)m).to_s

    # First tag that is not a comment, DOCTYPE or processing instruction
    root = head.match(%r(<[^!\?>]*>)m).to_s

    # The tag name may be followed by any whitespace (including a newline) or '>'
    root_match = root.match(%r(^<(\S+)[\s>]))
    root_element = root_match ? root_match[1] : ""

    # Accept quoted (possibly containing spaces) or bare attribute values,
    # with optional whitespace around '='
    version_match = root.match(/\sversion\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s">]+))/m)
    version_attr = version_match ? version_match.captures.compact.first.to_s : ""

    head_element = head.match(%r(<head.*<\/head>)mi)
    head_doc = ::Nokogiri::HTML.parse(head_element.to_s)

    # May determine content-type and/or charset from meta.
    # Easiest way is to parse head into a document and iterate
    # over CSS matches
    head_doc.css("meta").each do |meta|
      if meta.attr("http-equiv").to_s.downcase == 'content-type'
        content_type, params = meta.attr("content").to_s.downcase.split(";")
        charset = params.to_s[/charset=([^\s]*)$/i, 1]
        options[:encoding] = charset.downcase if charset
      elsif meta.attr("charset")
        options[:encoding] = meta.attr("charset").to_s.downcase
      end
    end
  end

  # Determine version from DOCTYPE and/or root element; default is RDFa 1.1
  @version ||= :"rdfa1.0" if doc_type_string =~ /RDFa 1\.0/
  @version ||= :"rdfa1.0" if version_attr =~ /RDFa 1\.0/
  @version ||= :"rdfa1.1" if version_attr =~ /RDFa 1\.1/
  @version ||= :"rdfa1.1"

  @host_language ||= :xhtml1 if @version == :"rdfa1.0" && doc_type_string =~ /html/i

  @host_language ||= case content_type
  when "application/xml"  then :xml
  when "image/svg+xml"    then :svg
  when "text/html"
    case doc_type_string
    when /html 4/i        then :html4
    when /xhtml/i         then :xhtml1
    when /html/i          then :html5
    else                       :html5
    end
  when "application/xhtml+xml"
    case doc_type_string
    when /html 4/i        then :html4
    when /xhtml/i         then :xhtml1
    when /html/i          then :xhtml5
    else                       :xhtml5
    end
  else
    # No usable content type: fall back to the root element name
    case root_element
    when /svg/i           then :svg
    else                       :html5
    end
  end
end

#doc_base(base) ⇒ String

Find value of document base

Parameters:

  • base (String)

    Existing base from URI or :base_uri

Returns:



304
305
306
307
308
309
310
311
312
313
314
315
316
# File 'vendor/bundler/ruby/2.5.0/bundler/gems/rdf-rdfa-64ca6c8311da/lib/rdf/rdfa/reader/nokogiri.rb', line 304

# Resolve the document base, taking an in-document override into account.
#
# For the HTML-family host languages the +href+ of +html>head>base+ (with any
# fragment removed) is joined onto +base+; for every other host language the
# +base+ attribute in the +RDF::XML+ namespace on the root element is joined
# instead. When no override is present +base+ is returned untouched.
#
# @param [#join] base
#   Existing base from URI or :base_uri
# @return [Object] +base+, or the result of +base.join+ with the override
def doc_base(base)
  if %i[xhtml1 xhtml5 html4 html5].include?(@host_language)
    element = @doc.at_css("html>head>base")
    return base unless element

    # Drop everything from the first '#' onwards before joining
    href = element.attribute("href").to_s.split("#").first
    base.join(href)
  else
    xml_base = root ? root.attribute_with_ns("base", RDF::XML.to_s) : nil
    xml_base ? base.join(xml_base) : base
  end
end

#doc_errors ⇒ Object

Document errors



289
290
291
292
293
294
295
296
297
# File 'vendor/bundler/ruby/2.5.0/bundler/gems/rdf-rdfa-64ca6c8311da/lib/rdf/rdfa/reader/nokogiri.rb', line 289

# Document errors, with known-noisy parser messages filtered out.
#
# FIXME: Nokogiri version 1.5 thinks many HTML5 elements are invalid, so all
# "Tag ... invalid" errors are simply dropped. Nokogumbo might make this simpler.
#
# @return [Array] entries of +@doc.errors+ whose message does not match the
#   ignore pattern selected for the current host language
def doc_errors
  ignorable = if @host_language == :html5
    /The doctype must be the first token in the document/
  else
    /(?:Tag \w+ invalid)|(?:Missing attribute name)/
  end

  @doc.errors.reject { |error| error.to_s =~ ignorable }
end

#initialize_xml(input, options = {})

This method returns an undefined value.

Initializes the underlying XML library.

Parameters:



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
# File 'vendor/bundler/ruby/2.5.0/bundler/gems/rdf-rdfa-64ca6c8311da/lib/rdf/rdfa/reader/nokogiri.rb', line 163

# Initializes the underlying XML library and stores the parsed document in +@doc+.
#
# An already-parsed Nokogiri document is used as is. Anything else is parsed
# with the parser matching +@host_language+: the HTML parser for +:html4+, the
# HTML5 parser for +:html5+ (falling back to the HTML parser when no HTML5
# parser can be loaded), and the XML parser otherwise.
#
# @param [Nokogiri::HTML::Document, Nokogiri::XML::Document, #read, String] input
# @param [Hash{Symbol => Object}] options
# @option options [#to_s] :encoding
#   input encoding; when absent it is taken from +input.charset+ if available,
#   else 'utf-8'. The resolved value is written back into +options+ as a String.
# @return [void]
def initialize_xml(input, options = {})
  require 'nokogiri' unless defined?(::Nokogiri)
  @doc = case input
  when ::Nokogiri::HTML::Document, ::Nokogiri::XML::Document
    input
  else
    # Try to detect charset from input
    options[:encoding] ||= input.charset if input.respond_to?(:charset)

    # Otherwise, default is utf-8
    options[:encoding] = (options[:encoding] || 'utf-8').to_s

    case @host_language
    when :html4
      ::Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
    when :html5
      begin
        # Only load Nokogumbo when no HTML5 entry point is available yet, so a
        # Nokogiri that already provides one does not fall back needlessly.
        unless defined?(::Nokogumbo) || ::Nokogiri.respond_to?(:HTML5)
          require 'nokogumbo'
        end
        input = input.read if input.respond_to?(:read)
        # Re-tag a copy: force_encoding mutates its receiver, which would alter
        # the caller's string and raise on frozen strings.
        input = input.dup.force_encoding(options[:encoding])
        ::Nokogiri::HTML5(input, max_parse_errors: 1000)
      rescue LoadError
        ::Nokogiri::HTML.parse(input, base_uri.to_s, options[:encoding])
      end
    else
      ::Nokogiri::XML.parse(input, base_uri.to_s, options[:encoding])
    end
  end
end

#root ⇒ Object

Return proxy for document root



283
284
285
# File 'vendor/bundler/ruby/2.5.0/bundler/gems/rdf-rdfa-64ca6c8311da/lib/rdf/rdfa/reader/nokogiri.rb', line 283

# Return proxy for document root.
#
# The proxy is created on first use and cached in +@root+.
#
# @return [NodeProxy, nil] +nil+ when there is no document or it has no root
def root
  return unless @doc && @doc.root

  @root ||= NodeProxy.new(@doc.root)
end