Changeset 238

Show
Ignore:
Timestamp:
04/24/08 06:21:11 (7 months ago)
Author:
eric.dumin..@gmail.com
Message:

Lots of refactoring.
146 examples, 17 failures, 6 pending

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • branches/yet_another_index_structure/lib/picolena/templates/app/models/document.rb

    r236 r238  
    106106  # Useful to get meta-info about it. 
    107107  def from_index 
    108     IndexReader.new[index_id] 
     108    Indexer.index[index_id] 
    109109  end 
    110110   
    111111  def self.find_by_unique_id(some_id) 
     112    #NOTE: No need for Finder. 
    112113    Finder.new("probably_unique_id:"<<some_id).matching_document 
    113114  end 
    114115   
    115116  def self.find_by_complete_path(complete_path) 
     117    #NOTE: No need for Finder. 
    116118    Finder.new('complete_path:"'<<complete_path<<'"').matching_document 
    117119  end 
  • branches/yet_another_index_structure/lib/picolena/templates/app/models/finder.rb

    r237 r238  
    33   
    44  def index 
    5       
     5     @@index ||= Indexer.index 
    66  end 
    77   
     
    99    @query = Query.extract_from(raw_query) 
    1010    @raw_query= raw_query 
    11     Index.ensure_existence 
     11    Indexer.ensure_index_existence 
    1212    @per_page=results_per_page 
    1313    @offset=(page.to_i-1)*results_per_page 
    14     index.should_have_documents 
    1514  end 
    1615   
  • branches/yet_another_index_structure/lib/picolena/templates/app/models/indexer.rb

    r237 r238  
    77  class << self     
    88    def index_every_directory(remove_first=false) 
     9      clear! if remove_first 
    910      log :debug => "Indexing every directory" 
    1011      start=Time.now 
     
    1213        index_directory_with_multithreads(dir) 
    1314      } 
     15      log :debug => "Now optimizing index" 
     16      writer.optimize 
    1417      log :debug => "Indexing done in #{Time.now-start} s." 
    1518    end 
     
    2730      indexing_list_chunks.each_with_thread{|chunk| 
    2831        chunk.each{|filename| 
    29           add(filename) 
     32          add_file(filename) 
    3033        } 
    3134      } 
     
    3336     
    3437    def add_file(complete_path) 
    35       log :debug => "Adding #{complete_path}" 
    3638      default_fields = Document.default_fields_for(complete_path) 
    3739      begin 
    3840        document = PlainTextExtractor.extract_content_and_language_from(complete_path) 
    39         raise "\tempty document #{complete_path}" if document[:content].strip.empty? 
    40         log :debug => "language found: #{document[:language]}" if document[:language] 
     41        raise "empty document #{complete_path}" if document[:content].strip.empty? 
    4142        document.merge! default_fields 
     43        log :debug => ["Added : #{complete_path}",document[:language] ? " (#{document[:language]})" : ""].join 
    4244      rescue => e 
    4345        log :debug => "\tindexing without content: #{e.message}" 
     
    4648    end 
    4749     
    48     def clear_all 
    49        
     50    # Ensures writer is closed, and removes every index file for RAILS_ENV. 
     51    def clear!(all=false) 
     52      close 
     53      to_remove=all ? Picolena::IndexesSavePath : Picolena::IndexSavePath 
     54      Dir.glob(File.join(to_remove,'**/*')).each{|f| FileUtils.rm(f) if File.file?(f)} 
    5055    end 
    5156     
     57    # Closes the writer and 
     58    # ensures that a new IndexWriter is instantiated next time writer is called. 
     59    def close 
     60      writer.close 
     61      # Ferret will SEGFAULT otherwise. 
     62      @@writer = nil 
     63    end 
     64     
     65    # Only one IndexWriter should be instantiated. 
     66    # If one already exists, returns it. 
     67    # Creates it otherwise. 
    5268    def writer 
    53       @@writer ||= [] 
     69      @@writer ||= Ferret::Index::IndexWriter.new(default_index_writer_params) 
     70    end 
     71     
     72    def index 
     73      Ferret::I.new(default_index_params)   
     74    end 
     75     
     76    def ensure_index_existence 
     77      index_every_directory(:remove_first) unless index_exists? or RAILS_ENV=="production" 
    5478    end 
    5579     
    5680    private 
    5781     
     82    def index_exists? 
     83      index_filename and File.exists?(index_filename) 
     84    end 
     85     
     86    def index_filename 
     87      Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first 
     88    end 
     89 
    5890    def log(hash) 
    5991      hash.each{|level,message| 
    6092        IndexerLogger.send(level,message) 
    6193      } 
    62     end   
     94    end 
     95     
     96    def default_index_params 
     97      {:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer} 
     98    end 
     99     
     100    def default_field_infos 
     101      returning Ferret::Index::FieldInfos.new do |field_infos| 
     102        field_infos.add_field(:complete_path,      :store => :yes, :index => :untokenized) 
     103        field_infos.add_field(:content,            :store => :yes, :index => :yes) 
     104        field_infos.add_field(:basename,           :store => :no,  :index => :yes, :boost => 1.5) 
     105        field_infos.add_field(:filename,           :store => :no,  :index => :yes, :boost => 1.5) 
     106        field_infos.add_field(:filetype,           :store => :no,  :index => :yes, :boost => 1.5) 
     107        field_infos.add_field(:modified,           :store => :yes, :index => :untokenized) 
     108        field_infos.add_field(:probably_unique_id, :store => :no,  :index => :yes) 
     109        field_infos.add_field(:lang,               :store => :yes, :index => :yes) 
     110      end 
     111    end 
     112     
     113    def default_index_writer_params 
     114      default_index_params.merge :field_infos => default_field_infos 
     115    end 
    63116  end 
    64117end 
  • branches/yet_another_index_structure/lib/picolena/templates/lib/tasks/index.rake

    r235 r238  
    33  desc 'Clear indexes' 
    44  task :clear => :environment do 
    5     Indexer.clear_all 
     5    Indexer.clear! :all 
    66  end 
    77   
  • branches/yet_another_index_structure/lib/picolena/templates/spec/models/basic_finder_spec.rb

    r167 r238  
    1111   
    1212  before(:each) do 
    13     IndexWriter.remove 
     13    Indexer.clear! 
    1414  end 
    1515   
    1616  it "should create index" do 
    1717    Picolena::IndexedDirectories.replace({'spec/test_dirs/indexed/just_one_doc'=>'//justonedoc/'}) 
    18     lambda {@finder_with_new_index=Finder.new("test moi")}.should change(IndexReader, :exists?).from(false).to(true) 
     18    lambda {@finder_with_new_index=Finder.new("test moi")}.should change(Indexer, :index_exists?).from(false).to(true) 
    1919    File.exists?(File.join(@new_index_path,'_0.cfs')).should be_true 
    20     IndexReader.new.size.should >0 
     20    Indexer.index.size.should >0 
    2121  end 
    2222   
     
    4545describe "Basic Finder" do   
    4646  before(:all) do 
    47     Indexer.index_every_directory(update=false) 
     47    Indexer.index_every_directory(remove_first=true) 
    4848  end 
    4949   
     
    8383  fields.each_pair do |description,field_name| 
    8484    it "should index #{description} as :#{field_name}" do 
    85       IndexReader.new.field_infos[field_name].should be_an_instance_of(Ferret::Index::FieldInfo) 
     85      Indexer.index.field_infos[field_name].should be_an_instance_of(Ferret::Index::FieldInfo) 
    8686    end 
    8787  end 
  • branches/yet_another_index_structure/lib/picolena/templates/spec/models/finder_spec.rb

    r217 r238  
    2222    File.utime(0, a_bit_later, 'spec/test_dirs/indexed/yet_another_dir/office2003-word-template.dot') 
    2323    File.utime(0, nineties, 'spec/test_dirs/indexed/others/placeholder.txt') 
    24     Indexer.index_every_directory(update=false) 
     24    Indexer.index_every_directory(remove_first=true) 
    2525  end 
    2626   
     
    4848   
    4949  it "should give a boost to basename, filename and filetype in index" do 
    50     index=IndexReader.new 
     50    index=Indexer.index 
    5151    index.field_infos[:basename].boost.should > 1.0 
    5252    index.field_infos[:file].boost.should > 1.0 
  • branches/yet_another_index_structure/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb

    r214 r238  
    33describe "PlainTextExtractors" do 
    44  before(:all) do 
    5     IndexReader.ensure_existence 
     5    Indexer.ensure_index_existence 
    66  end   
    77