Class Yapra::Plugin::Filter::EntryFullText
In: lib-plugins/yapra/plugin/filter/entry_full_text.rb
Parent: Yapra::Plugin::MechanizeBase

Filter::EntryFullText — Yuanying

Fetches the full text of each entry from its linked page using WWW::Mechanize.

    - module: Filter::EntryFullText
      config:
        regexp: http://www\.pixiv\.net/*
        extract_xpath:
          title: '//title/text()'
          dc_creator: "//div[@id='profile']/div/text()"
          author: "//div[@id='profile']/div/text()"
          description: "//div[@id='content2']"
        apply_template_after_extracted:
          content_encoded: '<div><%= title %></div>'

Methods

run  

Public Instance methods

[Source]

    # File lib-plugins/yapra/plugin/filter/entry_full_text.rb, line 20
20:     def run(data)
21:       regexp = nil
22:       if config['regexp']
23:         regexp = Regexp.new(config['regexp'])
24:       else
25:         regexp = /^(https?|ftp)(:\/\/[-_.!~*\'()a-zA-Z0-9;\/?:\@&=+\$,%#]+)$/
26:       end
27:       
28:       wait = config['wait'] || 1
29:       
30:       data.map! do |item|
31:         url = item
32:         if item.respond_to?('link')
33:           url = item.link
34:         end
35: 
36:         if regexp =~ url
37:           logger.debug "Process: #{url}"
38:           page = agent.get(url)
39:           sleep wait
40: 
41:           unless(item.instance_of?(RSS::RDF::Item))
42:             new_item = RSS::RDF::Item.new
43:             new_item.title = item.title rescue item.to_s
44:             new_item.date = item.date rescue Time.now
45:             new_item.description = item.description rescue item.to_s
46:             new_item.link = item.link rescue '#'
47:             item = new_item
48:           end
49: 
50:           extract_attribute_from page.root, item, binding
51: 
52:         end
53:         item
54:       end
55:       
56:       data
57:     end

[Validate]