Changeset 310

Show
Ignore:
Timestamp:
05/02/08 06:30:44 (7 months ago)
Author:
eric.dumin..@gmail.com
Message:

Not indexing content of binary files anymore

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • trunk/lib/picolena/templates/lib/core_exts.rb

    r309 r310  
    69 69    content 
    70 70  end 
     71   
     72  def self.plain_text?(filename) 
     73    %x{file -i "#{filename}"} =~ /: text\// 
     74  end 
    71 75 end 
  • trunk/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb

    r263 r310  
    4 4  aka "plain text file" 
    5 5  with {|source| 
    6     encoding=File.encoding(source) 
    7     #TODO: Return "binary file" if binary 
    8     if encoding.empty? then 
    9        File.read(source) 
     6    if File.plain_text?(source) then 
     7      encoding=File.encoding(source) 
     8      #TODO: Return "binary file" if binary 
     9      if encoding.empty? then 
     10         File.read(source) 
     11      else 
     12         %x{iconv -f #{encoding} -t utf8  "#{source}" 2>/dev/null} 
     13      end 
    10 14    else 
    11       %x{iconv -f #{encoding} -t utf8  "#{source}" 2>/dev/null} 
     15      "binary file!" 
    12 16    end 
    13 17  } 
  • trunk/lib/picolena/templates/spec/models/finder_spec.rb

    r301 r310  
    124 124  end 
    125 125 
    126   it "should not index content of binary files" 
    127 126 
    128 127  # Ferret sometimes SEGFAULT crashed with '*.pdf' queries 
  • trunk/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb

    r301 r310  
    28 28    } 
    29 29  } 
     30 
     31  it "should not extract content of binary files" do 
     32    PlainTextExtractor.extract_content_from("spec/test_dirs/indexed/others/BIN_FILE_WITHOUT_EXTENSION").should == "binary file!" 
     33  end 
    30 34 end