root/trunk/lib/picolena/templates/lib/plain_text_extractors/html.rb

Revision 178, 0.6 kB (checked in by eric.dumin..@gmail.com, 6 months ago)

Filters have been renamed to PlainTextExtractors.

Filter is a Rails protected name.

  • Property svn:executable set to *
Line 
require 'shellwords'

# Plain-text extractor for HTML documents (.html / .htm, MIME type text/html).
#
# The extraction block shells out to the external `html2text` tool. When the
# detected encoding is non-empty and is not UTF-8, the output is additionally
# piped through `iconv` to convert it to UTF-8.
PlainTextExtractor.new {
  every :html, :htm
  as "text/html"
  aka "HyperText Markup Language document"
  with {|source|
    # NOTE(review): File.encoding is not a Ruby core method; presumably a
    # project helper returning the charset name as a (possibly empty) String
    # -- confirm against its definition.
    encoding = File.encoding(source)

    # Escape every value interpolated into the shell command: a file name
    # containing quotes, backticks or `$(...)` would otherwise break the
    # command line or run arbitrary shell code.
    escaped_source = Shellwords.escape(source.to_s)

    # Compare the charset name with punctuation stripped and case folded, so
    # that spellings such as "UTF-8" and "utf8" are treated alike.
    if encoding.empty? || encoding.gsub(/[^\w]/, '').downcase == "utf8"
      %x{html2text -nobs #{escaped_source}}
    else
      %x{html2text -nobs #{escaped_source} | iconv -f #{Shellwords.escape(encoding)} -t utf8}
    end
  }
  which_requires 'html2text', 'iconv'
  # NOTE(review): the expected text below looks like mis-decoded UTF-8
  # (probably "für" / "Württemberg"). It is left untouched because it is the
  # string the extractor's example check compares against -- confirm against
  # the zafh.net.html fixture before changing it.
  which_should_for_example_extract 'zentrum fÃŒr angewandte forschung an fachhochschulen nachhaltige energietechnik Baden-WÃŒrttemberg', :from => 'zafh.net.html'
  or_extract 'Málaga', :from => '7.html'
  or_extract 'le monde', :from => 'lemonde.htm'
}
Note: See TracBrowser for help on using the browser.