Changeset 238
- Timestamp: 04/24/08 06:21:11 (7 months ago)
- Files:
- branches/yet_another_index_structure/lib/picolena/templates/app/models/document.rb (modified) (1 diff)
- branches/yet_another_index_structure/lib/picolena/templates/app/models/finder.rb (modified) (2 diffs)
- branches/yet_another_index_structure/lib/picolena/templates/app/models/indexer.rb (modified) (5 diffs)
- branches/yet_another_index_structure/lib/picolena/templates/lib/tasks/index.rake (modified) (1 diff)
- branches/yet_another_index_structure/lib/picolena/templates/spec/models/basic_finder_spec.rb (modified) (3 diffs)
- branches/yet_another_index_structure/lib/picolena/templates/spec/models/finder_spec.rb (modified) (2 diffs)
- branches/yet_another_index_structure/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
branches/yet_another_index_structure/lib/picolena/templates/app/models/document.rb
r236 r238
 106  106      # Useful to get meta-info about it.
 107  107      def from_index
 108       -     IndexReader.new[index_id]
      108  +     Indexer.index[index_id]
 109  109      end
 110  110
 111  111      def self.find_by_unique_id(some_id)
      112  +     #NOTE: No need for Finder.
 112  113        Finder.new("probably_unique_id:"<<some_id).matching_document
 113  114      end
 114  115
 115  116      def self.find_by_complete_path(complete_path)
      117  +     #NOTE: No need for Finder.
 116  118        Finder.new('complete_path:"'<<complete_path<<'"').matching_document
 117  119      end
branches/yet_another_index_structure/lib/picolena/templates/app/models/finder.rb
r237 r238
   3    3
   4    4      def index
   5    5        @@index ||= Indexer.index
   6    6      end
   7    7
   …    …
   9    9        @query = Query.extract_from(raw_query)
  10   10        @raw_query= raw_query
  11       -     Index.ensure_existence
       11  +     Indexer.ensure_index_existence
  12   12        @per_page=results_per_page
  13   13        @offset=(page.to_i-1)*results_per_page
  14       -     index.should_have_documents
  15   14      end
  16   15
branches/yet_another_index_structure/lib/picolena/templates/app/models/indexer.rb
r237 r238
   7    7      class << self
   8    8        def index_every_directory(remove_first=false)
        9  +       clear! if remove_first
   9   10          log :debug => "Indexing every directory"
  10   11          start=Time.now
   …    …
  12   13            index_directory_with_multithreads(dir)
  13   14          }
       15  +       log :debug => "Now optimizing index"
       16  +       writer.optimize
  14   17          log :debug => "Indexing done in #{Time.now-start} s."
  15   18        end
   …    …
  27   30          indexing_list_chunks.each_with_thread{|chunk|
  28   31            chunk.each{|filename|
  29       -           add(filename)
       32  +           add_file(filename)
  30   33            }
  31   34          }
   …    …
  33   36
  34   37        def add_file(complete_path)
  35       -       log :debug => "Adding #{complete_path}"
  36   38          default_fields = Document.default_fields_for(complete_path)
  37   39          begin
  38   40            document = PlainTextExtractor.extract_content_and_language_from(complete_path)
  39       -         raise "\tempty document #{complete_path}" if document[:content].strip.empty?
  40       -         log :debug => "language found: #{document[:language]}" if document[:language]
       41  +         raise "empty document #{complete_path}" if document[:content].strip.empty?
  41   42            document.merge! default_fields
       43  +         log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join
  42   44          rescue => e
  43   45            log :debug => "\tindexing without content: #{e.message}"
   …    …
  46   48        end
  47   49
  48       -     def clear_all
  49       -
       50  +     # Ensures writer is closed, and removes every index file for RAILS_ENV.
       51  +     def clear!(all=false)
       52  +       close
       53  +       to_remove=all ? Picolena::IndexesSavePath : Picolena::IndexSavePath
       54  +       Dir.glob(File.join(to_remove,'**/*')).each{|f| FileUtils.rm(f) if File.file?(f)}
  50   55        end
  51   56
       57  +     # Closes the writer and
       58  +     # ensures that a new IndexWriter is instantiated next time writer is called.
       59  +     def close
       60  +       writer.close
       61  +       # Ferret will SEGFAULT otherwise.
       62  +       @@writer = nil
       63  +     end
       64  +
       65  +     # Only one IndexWriter should be instantiated.
       66  +     # If one already exists, returns it.
       67  +     # Creates it otherwise.
  52   68        def writer
  53       -       @@writer ||= []
       69  +       @@writer ||= Ferret::Index::IndexWriter.new(default_index_writer_params)
       70  +     end
       71  +
       72  +     def index
       73  +       Ferret::I.new(default_index_params)
       74  +     end
       75  +
       76  +     def ensure_index_existence
       77  +       index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production"
  54   78        end
  55   79
  56   80        private
  57   81
       82  +     def index_exists?
       83  +       index_filename and File.exists?(index_filename)
       84  +     end
       85  +
       86  +     def index_filename
       87  +       Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first
       88  +     end
       89  +
  58   90        def log(hash)
  59   91          hash.each{|level,message|
  60   92            IndexerLogger.send(level,message)
  61   93          }
  62       -     end
       94  +     end
       95  +
       96  +     def default_index_params
       97  +       {:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer}
       98  +     end
       99  +
      100  +     def default_field_infos
      101  +       returning Ferret::Index::FieldInfos.new do |field_infos|
      102  +         field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized)
      103  +         field_infos.add_field(:content, :store => :yes, :index => :yes)
      104  +         field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5)
      105  +         field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5)
      106  +         field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5)
      107  +         field_infos.add_field(:modified, :store => :yes, :index => :untokenized)
      108  +         field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes)
      109  +         field_infos.add_field(:lang, :store => :yes, :index => :yes)
      110  +       end
      111  +     end
      112  +
      113  +     def default_index_writer_params
      114  +       default_index_params.merge :field_infos => default_field_infos
      115  +     end
  63  116      end
  64  117    end
branches/yet_another_index_structure/lib/picolena/templates/lib/tasks/index.rake
r235 r238
   3    3      desc 'Clear indexes'
   4    4      task :clear => :environment do
   5       -     Indexer.clear_all
        5  +     Indexer.clear! :all
   6    6      end
   7    7
branches/yet_another_index_structure/lib/picolena/templates/spec/models/basic_finder_spec.rb
r167 r238
  11   11
  12   12      before(:each) do
  13       -     IndexWriter.remove
       13  +     Indexer.clear!
  14   14      end
  15   15
  16   16      it "should create index" do
  17   17        Picolena::IndexedDirectories.replace({'spec/test_dirs/indexed/just_one_doc'=>'//justonedoc/'})
  18       -     lambda {@finder_with_new_index=Finder.new("test moi")}.should change(IndexReader, :exists?).from(false).to(true)
       18  +     lambda {@finder_with_new_index=Finder.new("test moi")}.should change(Indexer, :index_exists?).from(false).to(true)
  19   19        File.exists?(File.join(@new_index_path,'_0.cfs')).should be_true
  20       -     IndexReader.new.size.should >0
       20  +     Indexer.index.size.should >0
  21   21      end
  22   22
   …    …
  45   45    describe "Basic Finder" do
  46   46      before(:all) do
  47       -     Indexer.index_every_directory(update=false)
       47  +     Indexer.index_every_directory(remove_first=true)
  48   48      end
  49   49
   …    …
  83   83      fields.each_pair do |description,field_name|
  84   84        it "should index #{description} as :#{field_name}" do
  85       -       IndexReader.new.field_infos[field_name].should be_an_instance_of(Ferret::Index::FieldInfo)
       85  +       Indexer.index.field_infos[field_name].should be_an_instance_of(Ferret::Index::FieldInfo)
  86   86        end
  87   87      end
branches/yet_another_index_structure/lib/picolena/templates/spec/models/finder_spec.rb
r217 r238
  22   22        File.utime(0, a_bit_later, 'spec/test_dirs/indexed/yet_another_dir/office2003-word-template.dot')
  23   23        File.utime(0, nineties, 'spec/test_dirs/indexed/others/placeholder.txt')
  24       -     Indexer.index_every_directory(update=false)
       24  +     Indexer.index_every_directory(remove_first=true)
  25   25      end
  26   26
   …    …
  48   48
  49   49      it "should give a boost to basename, filename and filetype in index" do
  50       -     index=IndexReader.new
       50  +     index=Indexer.index
  51   51        index.field_infos[:basename].boost.should > 1.0
  52   52        index.field_infos[:file].boost.should > 1.0
branches/yet_another_index_structure/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
r214 r238
   3    3    describe "PlainTextExtractors" do
   4    4      before(:all) do
   5       -     IndexReader.ensure_existence
        5  +     Indexer.ensure_index_existence
   6    6      end
   7    7
