Changeset 178
- Timestamp:
- 04/20/08 02:25:44 (7 months ago)
- Files:
-
- branches/oo_indexer/README.txt (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/app/models/document.rb (modified) (2 diffs)
- branches/oo_indexer/lib/picolena/templates/app/models/indexer.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/app/models/plain_text_extractor.rb (moved) (moved from branches/oo_indexer/lib/picolena/templates/app/models/filter.rb) (1 diff)
- branches/oo_indexer/lib/picolena/templates/config/initializers/004_load_filters.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractor_DSL.rb (moved) (moved from branches/oo_indexer/lib/picolena/templates/lib/filter_DSL.rb) (5 diffs)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors (moved) (moved from branches/oo_indexer/lib/picolena/templates/lib/filters)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/html.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb (modified) (2 diffs)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb (modified) (2 diffs)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb (modified) (2 diffs)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb (modified) (2 diffs)
- branches/oo_indexer/lib/picolena/templates/lib/tasks/install_dependencies.rake (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/spec/helpers/documents_helper_spec.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/spec/models/host_indexing_system_spec.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb (moved) (moved from branches/oo_indexer/lib/picolena/templates/spec/models/filters_spec.rb) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
branches/oo_indexer/README.txt
r107 r178 15 15 Picolena has many advantages: 16 16 17 * it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files with full text search, and offers a very easy way to add new filters to index other filetypes.17 * it can index .pdf, .doc, .docx, .odt, .xls, .ods, .ppt, .pptx, .odp, .rtf, .html and plain text files with full text search, and offers a very easy way to add new extractors to index other filetypes. 18 18 * it is free as in free beer and as in free speech 19 19 * thanks to Ferret, it is very fast branches/oo_indexer/lib/picolena/templates/app/models/document.rb
r177 r178 50 50 end 51 51 52 # Returns true iff some Filter has been defined to convert it to plain text.52 # Returns true iff some PlainTextExtractor has been defined to convert it to plain text. 53 53 # Document.new("presentation.pdf").supported? => true 54 54 # Document.new("presentation.some_weird_extension").supported? => false 55 55 def supported? 56 Filter.supported_extensions.include?(self.ext_as_sym)56 PlainTextExtractor.supported_extensions.include?(self.ext_as_sym) 57 57 end 58 58 59 59 # Retrieves content as it is *now*. 60 60 def content 61 Filter.extract_content_from(complete_path)61 PlainTextExtractor.extract_content_from(complete_path) 62 62 end 63 63 … … 65 65 # Returns content as it was at the time it was indexed. 66 66 def cached 67 IndexReader.new[index_id][:content]67 from_index[:content] 68 68 end 69 69 branches/oo_indexer/lib/picolena/templates/app/models/indexer.rb
r177 r178 100 100 101 101 begin 102 text = Filter.extract_content_from(complete_path)102 text = PlainTextExtractor.extract_content_from(complete_path) 103 103 raise "\tempty document #{complete_path}" if text.strip.empty? 104 104 fields[:content] = text branches/oo_indexer/lib/picolena/templates/app/models/plain_text_extractor.rb
r177 r178 1 require ' filter_DSL'1 require 'plain_text_extractor_DSL' 2 2 3 class Filter4 include FilterDSL5 @@ filters=[]3 class PlainTextExtractor 4 include PlainTextExtractorDSL 5 @@extractors=[] 6 6 class<<self 7 # Returns every defined filter7 # Returns every defined extractor 8 8 def all 9 @@ filters9 @@extractors 10 10 end 11 11 12 # Add a filter to the filters list13 def add( filter)14 @@ filters<<filter12 # Add an extractor to the extractors list 13 def add(extractor) 14 @@extractors<<extractor 15 15 end 16 16 17 # Calls block for each filter17 # Calls block for each extractor 18 18 def each(&block) 19 19 all.each(&block) 20 20 end 21 21 22 # Returns every required dependency for every defined filter22 # Returns every required dependency for every defined extractor 23 23 def dependencies 24 @@dependencies||=all.collect{| filter| filter.dependencies}.flatten.compact.uniq.sort24 @@dependencies||=all.collect{|extractor| extractor.dependencies}.flatten.compact.uniq.sort 25 25 end 26 26 27 27 # Returns every supported file extensions 28 28 def supported_extensions 29 @@supported_exts||=all.collect{| filter| filter.exts}.flatten.compact.uniq29 @@supported_exts||=all.collect{|extractor| extractor.exts}.flatten.compact.uniq 30 30 end 31 31 32 # Finds which filter should be used for a given file, according to its extension32 # Finds which extractor should be used for a given file, according to its extension 33 33 # Raises if the file is unsupported. 
34 34 def find_by_filename(filename) 35 35 ext=File.ext_as_sym(filename) 36 filter=all.find{|filter| filter.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}")37 filter.source=filename38 filter36 extractor=all.find{|extractor| extractor.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}") 37 extractor.source=filename 38 extractor 39 39 end 40 40 41 # Launches filter on given file and outputs plain text result41 # Launches extractor on given file and outputs plain text result 42 42 def extract_content_from(source) 43 43 find_by_filename(source).extract_content branches/oo_indexer/lib/picolena/templates/config/initializers/004_load_filters.rb
r170 r178 1 1 require 'core_exts' 2 require ' filter_DSL'2 require 'plain_text_extractor_DSL' 3 3 4 Dir.glob(File.join(RAILS_ROOT,'lib/ filters/*.rb')).each{|filter|5 require filter4 Dir.glob(File.join(RAILS_ROOT,'lib/plain_text_extractors/*.rb')).each{|extractor| 5 require extractor 6 6 } branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractor_DSL.rb
r177 r178 1 # Defines Filters with DSL1 # Defines plain text extractors with DSL 2 2 # For example, to convert "Microsoft Office Word document" to plain text 3 # Filter.new {3 # PlainTextExtractor.new { 4 4 # every :doc, :dot 5 5 # as "application/msword" … … 10 10 # } 11 11 12 module FilterDSL12 module PlainTextExtractorDSL 13 13 attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples 14 14 … … 16 16 @content_and_file_examples=[] 17 17 self.instance_eval(&block) 18 Filter.add(self)18 PlainTextExtractor.add(self) 19 19 MimeType.add(self.exts,self.mime_name) 20 20 end … … 36 36 end 37 37 38 #used by rspec to test filters:38 #used by rspec to test extractors: 39 39 # which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf' 40 40 # or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf' 41 41 # 42 42 #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed, 43 #and if plain text output from the filter applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file'43 #and if plain text output from the extractor applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file' 44 44 def which_should_for_example_extract(content, file) 45 45 @content_and_file_examples << [content,file[:from]] … … 66 66 command_as_hash_or_string.invert[platform].dup 67 67 else 68 block || raise("No command defined for this filter: #{description}")68 block || raise("No command defined for this extractor: #{description}") 69 69 end 70 70 @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|')) branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
r173 r178 5 5 # Home page: http://www.foolabs.com/xpdf/ 6 6 7 Filter.new {7 PlainTextExtractor.new { 8 8 every :pdf 9 9 as "application/pdf" branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/html.rb
r173 r178 1 Filter.new {1 PlainTextExtractor.new { 2 2 every :html, :htm 3 3 as "text/html" branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
r173 r178 1 1 #Excel 97-2003 2 2 3 Filter.new {3 PlainTextExtractor.new { 4 4 every :xls 5 5 as "application/excel" … … 12 12 13 13 require 'zip/zip' 14 Filter.new {14 PlainTextExtractor.new { 15 15 every :xlsx 16 16 as 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
r173 r178 1 1 #Powerpoint 97-2003 2 2 3 Filter.new {3 PlainTextExtractor.new { 4 4 every :ppt, :pps 5 5 as "application/powerpoint" … … 14 14 15 15 require 'zip/zip' 16 Filter.new {16 PlainTextExtractor.new { 17 17 every :pptx 18 18 as 'application/vnd.openxmlformats-officedocument.presentationml.presentation' #could that mime BE any longer? branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb
r173 r178 5 5 # http://www.gnu.org/software/unrtf/unrtf.html 6 6 7 Filter.new {7 PlainTextExtractor.new { 8 8 every :rtf 9 9 as "application/rtf" branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
r173 r178 1 1 #Word 97-2003 2 2 3 Filter.new {3 PlainTextExtractor.new { 4 4 every :doc, :dot 5 5 as "application/msword" … … 13 13 14 14 require 'zip/zip' 15 Filter.new {15 PlainTextExtractor.new { 16 16 every :docx, :dotx 17 17 as 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb
r173 r178 2 2 3 3 require 'zip/zip' 4 Filter.new {4 PlainTextExtractor.new { 5 5 every :odp 6 6 as 'application/vnd.oasis.opendocument.presentation' branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb
r173 r178 2 2 3 3 require 'zip/zip' 4 Filter.new {4 PlainTextExtractor.new { 5 5 every :ods 6 6 as 'application/vnd.oasis.opendocument.spreadsheet' branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb
r173 r178 2 2 3 3 require 'zip/zip' 4 Filter.new {4 PlainTextExtractor.new { 5 5 every :odt 6 6 as 'application/vnd.oasis.opendocument.text' branches/oo_indexer/lib/picolena/templates/lib/plain_text_extractors/plain_text.rb
r173 r178 1 Filter.new {1 PlainTextExtractor.new { 2 2 every :txt, :text, :tex, :for, :cpp, :c, :rb, :ins, :vee, :java, :no_extension 3 3 as "application/plain" … … 15 15 which_requires 'iconv' 16 16 17 # to check if filter is working with basic plain text files17 # to check if the extractor is working with basic plain text files 18 18 which_should_for_example_extract 'Hello world!', :from => 'hello.rb' 19 19 or_extract 'text inside!', :from => 'crossed.txt' branches/oo_indexer/lib/picolena/templates/lib/tasks/install_dependencies.rake
r170 r178 30 30 task :deb_packages do 31 31 root_privileges_required! 32 #TODO: Should load this list from defined Filters32 #TODO: Should load this list from defined PlainTextExtractors 33 33 packages=%w{antiword poppler-utils odt2txt html2text catdoc unrtf}.join(" ") 34 34 puts "Installing "<<packages branches/oo_indexer/lib/picolena/templates/spec/helpers/documents_helper_spec.rb
r170 r178 4 4 it "shouldn't raise if matching not in content field" 5 5 6 Filter.supported_extensions.each{|ext|6 PlainTextExtractor.supported_extensions.each{|ext| 7 7 it "should have an icon for .#{ext} filetype" do 8 8 icon_for(ext.to_s).should_not be_nil branches/oo_indexer/lib/picolena/templates/spec/models/host_indexing_system_spec.rb
r170 r178 2 2 3 3 describe "Host indexing system" do 4 Filter.dependencies.each do |dependency|4 PlainTextExtractor.dependencies.each do |dependency| 5 5 it "should have #{dependency} installed" do 6 6 IO.popen("which #{dependency}"){|i| i.read.should_not be_empty} branches/oo_indexer/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
r170 r178 1 1 require File.dirname(__FILE__) + '/../spec_helper' 2 2 3 describe " Filters" do3 describe "PlainTextExtractors" do 4 4 before(:all) do 5 5 IndexReader.ensure_existence 6 6 end 7 7 8 Filter.each{|filter|9 filter.exts.each{|ext|10 should_extract= "should be able to extract content from #{ filter.description} (.#{ext})"11 content_and_file_examples_for_this_ext= filter.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext}8 PlainTextExtractor.each{|extractor| 9 extractor.exts.each{|ext| 10 should_extract= "should be able to extract content from #{extractor.description} (.#{ext})" 11 content_and_file_examples_for_this_ext=extractor.content_and_file_examples.select{|content,file| File.ext_as_sym(file)==ext} 12 12 unless content_and_file_examples_for_this_ext.empty? then 13 13 it should_extract do … … 22 22 else 23 23 ## It means that the spec for this extension file is "Not yet implemented"! 24 ## add this line to the corresponding filter in lib/filters:24 ## add this line to the corresponding extractor in lib/plain_text_extractors: 25 25 # which_should_for_example_extract 'some content', :from => 'a file you could add in spec/test_dirs/indexed/' 26 26 it should_extract
