Changeset 177
- Timestamp:
- 04/20/08 01:50:58 (7 months ago)
- Files:
-
- branches/oo_indexer/lib/picolena/config/basic.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/app/helpers/documents_helper.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/app/models/document.rb (modified) (3 diffs)
- branches/oo_indexer/lib/picolena/templates/app/models/filter.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/app/models/finder.rb (modified) (1 diff)
- branches/oo_indexer/lib/picolena/templates/lib/filter_DSL.rb (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
branches/oo_indexer/lib/picolena/config/basic.rb
r167 r177 1 1 module Picolena 2 # Specify indexes path.3 # Storage should be sufficient in order to store all indexed data.4 IndexesSavePath=File.join(RAILS_ROOT, 'tmp/ferret_indexes/')5 6 7 # Which language should be used?8 # English (:en), German (:de), French (:fr) and Spanish (:es) are currently supported9 # English is chosen by default.10 # If you'd like to use another language, you can find templates in #{RAILS_ROOT}/lang/ui,11 # then add your own language in this directory, and modify this line:12 Globalite.language = :en13 14 15 # Specify which locale should be used by Ferret16 Ferret.locale = "en_US.UTF-8"17 18 19 # Results per page20 ResultsPerPage = 1021 22 23 # Length of "probably unique id" 's24 # Those id's are used to characterize every document, thus allowing tiny URLs in Controllers25 # HashLength = 1026 # Document.new("whatever.pdf").probably_unique_id => "bbuxhynait"27 # HashLength = 2028 # Document.new("whatever.pdf").probably_unique_id => "jfzjkyfkfkbbuxhynait"29 # The more documents you have, the bigger HashLength should be in order to avoid collisions.30 # It would not be wise (and specs won't pass) to specify HashLength smaller than 10.31 HashLength = 1032 33 34 # Specify the default Levenshtein distance when using FuzzyQuery35 # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information.36 Ferret::Search::FuzzyQuery.default_min_similarity=0.637 Analyzer=Ferret::Analysis::StandardAnalyzer.new2 # Specify indexes path. 3 # Storage should be sufficient in order to store all indexed data. 4 IndexesSavePath=File.join(RAILS_ROOT, 'tmp/ferret_indexes/') 5 6 7 # Which language should be used? 8 # English (:en), German (:de), French (:fr) and Spanish (:es) are currently supported 9 # English is chosen by default. 
10 # If you'd like to use another language, you can find templates in #{RAILS_ROOT}/lang/ui, 11 # then add your own language in this directory, and modify this line: 12 Globalite.language = :en 13 14 15 # Specify which locale should be used by Ferret 16 Ferret.locale = "en_US.UTF-8" 17 18 19 # Results per page 20 ResultsPerPage = 10 21 22 23 # Length of "probably unique id" 's 24 # Those id's are used to characterize every document, thus allowing tiny URLs in Controllers 25 # HashLength = 10 26 # Document.new("whatever.pdf").probably_unique_id => "bbuxhynait" 27 # HashLength = 20 28 # Document.new("whatever.pdf").probably_unique_id => "jfzjkyfkfkbbuxhynait" 29 # The more documents you have, the bigger HashLength should be in order to avoid collisions. 30 # It would not be wise (and specs won't pass) to specify HashLength smaller than 10. 31 HashLength = 10 32 33 34 # Specify the default Levenshtein distance when using FuzzyQuery 35 # see http://ferret.davebalmain.com/api/classes/Ferret/QueryParser.html for more information. 36 Ferret::Search::FuzzyQuery.default_min_similarity=0.6 37 Analyzer=Ferret::Analysis::StandardAnalyzer.new 38 38 end branches/oo_indexer/lib/picolena/templates/app/helpers/documents_helper.rb
r166 r177 4 4 @matching_documents.nil? or @matching_documents.entries.empty? 5 5 end 6 6 7 7 # Very basic pagination. 8 8 # Provides links to Next, Prev and FirstPage when needed. 9 9 def should_paginate(page,query) 10 [(link_to("←←", :action => :show, :id => query, :page => 1) if page.number>2), 11 (link_to("←", :action => :show, :id => query, :page => page.prev.number) if page.prev?), 12 (link_to("→", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ") 10 [(link_to("←←", :action => :show, :id => query, :page => 1) if page.number>2), 11 (link_to("←", :action => :show, :id => query, :page => page.prev.number) if page.prev?), 12 (link_to("→", :action => :show, :id => query, :page => page.next.number) if page.next?)].compact.join(" | ") 13 13 end 14 14 branches/oo_indexer/lib/picolena/templates/app/models/document.rb
r176 r177 67 67 IndexReader.new[index_id][:content] 68 68 end 69 69 70 70 # FIXME: Not just date anymore. 71 71 # Returns the last modification date before the document got indexed. … … 78 78 from_index[:date].to_i 79 79 end 80 80 81 81 # Returns the id with which the document is indexed. 82 82 def index_id … … 85 85 86 86 private 87 87 88 88 # Retrieves the document from the index. 89 89 # Useful to get meta-info about it. branches/oo_indexer/lib/picolena/templates/app/models/filter.rb
r174 r177 5 5 @@filters=[] 6 6 class<<self 7 # Returns every defined filter 8 def all 9 @@filters 7 # Returns every defined filter 8 def all 9 @@filters 10 end 11 12 # Add a filter to the filters list 13 def add(filter) 14 @@filters<<filter 15 end 16 17 # Calls block for each filter 18 def each(&block) 19 all.each(&block) 20 end 21 22 # Returns every required dependency for every defined filter 23 def dependencies 24 @@dependencies||=all.collect{|filter| filter.dependencies}.flatten.compact.uniq.sort 25 end 26 27 # Returns every supported file extensions 28 def supported_extensions 29 @@supported_exts||=all.collect{|filter| filter.exts}.flatten.compact.uniq 30 end 31 32 # Finds which filter should be used for a given file, according to its extension 33 # Raises if the file is unsupported. 34 def find_by_filename(filename) 35 ext=File.ext_as_sym(filename) 36 filter=all.find{|filter| filter.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}") 37 filter.source=filename 38 filter 39 end 40 41 # Launches filter on given file and outputs plain text result 42 def extract_content_from(source) 43 find_by_filename(source).extract_content 44 end 45 end 46 47 attr_accessor :source 48 49 # Parses command in order to know which programs are needed. 50 # rspec will then check that every dependecy is installed on the system 51 def dependencies 52 if command.is_a?(String) then 53 command.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first} 54 else 55 @dependencies 56 end 10 57 end 11 58 12 # Add a filter to the filters list 13 def add(filter) 14 @@filters<<filter 15 end 16 17 # Calls block for each filter 18 def each(&block) 19 all.each(&block) 59 ## Conversion part 60 61 # destination method can be used by some conversion command that cannot output to stdout (example?) 62 # a file containing plain text result will first be written by command, and then be read by extract_content. 
63 def destination 64 require 'tmpdir' 65 @@temp_file_as_destination ||= File.join(Dir::tmpdir,"ferret_#{Time.now.to_i}") 20 66 end 21 67 22 # Returns every required dependency for every defined filter 23 def dependencies 24 @@dependencies||=all.collect{|filter| filter.dependencies}.flatten.compact.uniq.sort 68 # Replaces generic command with specific source and destination (if specified) files 69 def specific_command 70 command.sub('SOURCE','"'<<source<<'"').sub('DESTINATION','"'<<destination<<'"') 25 71 end 26 72 27 # Returns every supported file extensions 28 def supported_extensions 29 @@supported_exts||=all.collect{|filter| filter.exts}.flatten.compact.uniq 30 end 31 32 # Finds which filter should be used for a given file, according to its extension 33 # Raises if the file is unsupported. 34 def find_by_filename(filename) 35 ext=File.ext_as_sym(filename) 36 filter=all.find{|filter| filter.exts.include?(ext)} || raise(ArgumentError, "no convertor for #{filename}") 37 filter.source=filename 38 filter 39 end 40 41 # Launches filter on given file and outputs plain text result 42 def extract_content_from(source) 43 find_by_filename(source).extract_content 44 end 45 end 46 47 48 49 attr_accessor :source 50 51 # Parses command in order to know which programs are needed. 52 # rspec will then check that every dependecy is installed on the system 53 def dependencies 54 if command.is_a?(String) then 55 command.split(/\|\s*/).collect{|command_part| command_part.split(/ /).first} 73 # Returns plain text content of source file 74 def extract_content 75 if command.is_a?(String) then 76 # If command is a String, launch it via system(command). 77 if command.include?('DESTINATION') then 78 # If command includes 'DESTINATION' keyword, 79 # launches the command and returns the content of 80 # DESTINATION file. 81 system(specific_command) 82 File.read_and_remove(destination) 56 83 else 57 @dependencies 84 # Otherwise, launches the command and returns STDOUT. 
85 IO.popen(specific_command){|io| io.read} 58 86 end 87 else 88 # command is a Block. 89 # Returns the result of command.call, 90 # with source file as parameter. 91 command.call(source) 59 92 end 60 61 ## Conversion part 62 63 # destination method can be used by some conversion command that cannot output to stdout (example?) 64 # a file containing plain text result will first be written by command, and then be read by extract_content. 65 def destination 66 require 'tmpdir' 67 @@temp_file_as_destination ||= File.join(Dir::tmpdir,"ferret_#{Time.now.to_i}") 68 end 69 70 # Replaces generic command with specific source and destination (if specified) files 71 def specific_command 72 command.sub('SOURCE','"'<<source<<'"').sub('DESTINATION','"'<<destination<<'"') 73 end 74 75 # Returns plain text content of source file 76 def extract_content 77 if command.is_a?(String) then 78 if command.include?('DESTINATION') then 79 system(specific_command) 80 File.read_and_remove(destination) 81 else 82 IO.popen(specific_command){|io| io.read} 83 end 84 else 85 command.call(source) 86 end 87 end 93 end 88 94 end branches/oo_indexer/lib/picolena/templates/app/models/finder.rb
r167 r177 26 26 found_doc=Document.new(index[index_id][:complete_path]) 27 27 found_doc.matching_content=index.highlight(query, index_id, 28 :field => :content, :excerpt_length => 80,29 :pre_tag => "<<", :post_tag => ">>"28 :field => :content, :excerpt_length => 80, 29 :pre_tag => "<<", :post_tag => ">>" 30 30 ) unless @raw_query=~/^\*+\.\w*$/ 31 31 found_doc.score=score branches/oo_indexer/lib/picolena/templates/lib/filter_DSL.rb
r173 r177 1 1 # Defines Filters with DSL 2 2 # For example, to convert "Microsoft Office Word document" to plain text 3 # Filter.new { 4 # every :doc, :dot 5 # as "application/msword" 6 # aka "Microsoft Office Word document" 7 # with "antiword SOURCE > DESTINATION 2>/dev/null" => :on_linux, "some other command" => :on_windows 8 # which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc' 9 # } 3 # Filter.new { 4 # every :doc, :dot 5 # as "application/msword" 6 # aka "Microsoft Office Word document" 7 # with "antiword SOURCE" => :on_linux, "some other command" => :on_windows 8 # which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc' 9 # or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot' 10 # } 10 11 11 12 module FilterDSL 12 attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples 13 14 def initialize(&block) 15 @content_and_file_examples=[] 16 self.instance_eval(&block) 17 Filter.add(self) 18 MimeType.add(self.exts,self.mime_name) 13 attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples 14 15 def initialize(&block) 16 @content_and_file_examples=[] 17 self.instance_eval(&block) 18 Filter.add(self) 19 MimeType.add(self.exts,self.mime_name) 20 end 21 22 def every(*exts) 23 @exts=exts 24 end 25 26 def as(mime_name) 27 @mime_name=mime_name 28 end 29 30 def aka(description) 31 @description=description 32 end 33 34 def which_requires(*dependencies) 35 @dependencies=dependencies 36 end 37 38 #used by rspec to test filters: 39 # which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf' 40 # or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf' 41 # 42 #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed, 43 #and if plain text output from the filter applied to 
'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file' 44 def which_should_for_example_extract(content, file) 45 @content_and_file_examples << [content,file[:from]] 46 end 47 48 #it allows to define specs in this way: 49 # which_should_for_example_extract 'Hello world!', :from => 'hello.rb' 50 # or_extract 'text inside!', :from => 'crossed.txt' 51 alias_method :or_extract, :which_should_for_example_extract 52 53 def with(command_as_hash_or_string=nil,&block) 54 #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD... 55 platform=case RUBY_PLATFORM 56 when /linux/ 57 :on_linux 58 when /win/ 59 :on_windows 19 60 end 20 21 def every(*exts) 22 @exts=exts 61 @command=case command_as_hash_or_string 62 when String 63 command_as_hash_or_string 64 when Hash 65 #dup must be used, otherwise @command gets frozen. No idea why though.... 66 command_as_hash_or_string.invert[platform].dup 67 else 68 block || raise("No command defined for this filter: #{description}") 23 69 end 24 25 def as(mime_name) 26 @mime_name=mime_name 27 end 28 29 def aka(description) 30 @description=description 31 end 32 33 def which_requires(*dependencies) 34 @dependencies=dependencies 35 end 36 37 #used by rspec to test filters: 38 # which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf' 39 # or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf' 40 # 41 #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed, 42 #and if plain text output from the filter applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file' 43 def which_should_for_example_extract(content, file) 44 @content_and_file_examples << [content,file[:from]] 45 end 46 47 #it allows to define specs in this way: 48 # which_should_for_example_extract 'Hello world!', :from => 
'hello.rb' 49 # or_extract 'text inside!', :from => 'crossed.txt' 50 alias_method :or_extract, :which_should_for_example_extract 51 52 def with(command_as_hash_or_string=nil,&block) 53 platform=case RUBY_PLATFORM 54 when /linux/ 55 :on_linux 56 when /win/ 57 :on_windows 58 end 59 @command=case command_as_hash_or_string 60 when String 61 command_as_hash_or_string 62 when Hash 63 #dup must be used, otherwise @command gets frozen. No idea why though.... 64 command_as_hash_or_string.invert[platform].dup 65 else 66 block || raise("No command defined for this filter: #{description}") 67 end 68 @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|')) 69 end 70 @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|')) 71 end 70 72 end
