Changeset 234

Show
Ignore:
Timestamp:
04/24/08 03:16:24 (7 months ago)
Author:
eric.dumin..@gmail.com
Message:

Slim Indexer.
PlainTextExtractor.extract_... methods now return a hash.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • branches/yet_another_index_structure/lib/picolena/templates/app/models/indexer.rb

    r233 r234  
    1717    end     
    1818     
    19     def index_every_directory(update=true) 
     19    def index_every_directory 
    2020      log :debug => "Indexing every directory" 
    21  
    2221      start=Time.now 
    23       @update = update 
    24       reset! unless update 
    25        
    2622      Picolena::IndexedDirectories.each{|dir, alias_dir| 
    2723        index_directory_with_multithreads(dir) 
    2824      } 
    29       # FIXME: with those 2 lines, 
    30       writer.optimize 
    31       writer.close 
    32       # launching Indexer.index_every_directory twice in a row 
    33       # would raise a SEGFAULT: 
    34       # picolena/lib/picolena/templates/app/models/indexer.rb:27: [BUG] Segmentation fault 
    35       # ruby 1.8.6 (2007-06-07) [i486-linux] 
    36       # 
    37       # Aborted (core dumped) 
    38       # 
    39       # But without those 2 lines, specs don't pass anymore. 
    40       # 
    4125      log :debug => "Indexing done in #{Time.now-start} s." 
    4226    end 
    4327     
    4428    def index_directory_with_multithreads(dir) 
    45       # FIXME: Don't know why, but if more than one thread is created while update the index, 
    46       # indexer raises: 
    47       # 
    48       # current thread not owner 
    49       # /usr/lib/ruby/1.8/monitor.rb:278:in `mon_check_owner' 
    50       # /home/www/picolena/lib/picolena/templates/lib/core_exts.rb:32:in `join' 
    51       # ... 
    52       # 
    53       # So Index creation is multithreaded, Index update is monothreaded. 
    54       threads_number = @update ? 1 : @@max_threads_number 
     29      threads_number = @@max_threads_number 
    5530      log :debug => "Indexing #{dir}, #{threads_number} thread(s)" 
    5631       
     
    7045    def add_or_update_file(complete_path) 
    7146      should_be_added = true 
    72       if @update then 
    73         log :debug =>  "What to do with #{complete_path} ?" 
    74         occurences = reader.occurences_number(complete_path)  
    75         log :debug =>  "\tappears #{occurences} times in the index" 
    76         case occurences 
    77           when 0 
    78           #Nothing to do here, the file will be added. 
    79           when 1 
    80           d=Document.find_by_complete_path(complete_path) 
    81           if File.mtime(complete_path).strftime("%Y%m%d%H%M%S").to_i > d.mtime then 
    82             log :debug => "\thas been modified" 
    83             delete_file(complete_path) 
    84           else 
    85             should_be_added = false 
    86             log :debug => "\thas not been modified. leaving it" 
    87           end 
    88         else 
    89           delete_file(complete_path) 
    90         end 
    91       end 
    92       add_file(complete_path) if should_be_added 
     47      add_file(complete_path) 
    9348    end 
    9449     
     
    9651      log :debug => "Adding #{complete_path}" 
    9752      mime_type=File.mime(complete_path) 
    98       fields = fields_for(complete_path) 
    99        
     53      default_fields = fields_for(complete_path) 
    10054      begin  
    101         text, lang = PlainTextExtractor.extract_content_and_language_from(complete_path) 
    102         raise "\tempty document #{complete_path}" if text.strip.empty? 
    103         fields[:content] = text 
    104         log :debug => "language found: #{lang}" if lang 
    105         fields[:lang] = lang 
     55        document = PlainTextExtractor.extract_content_and_language_from(complete_path) 
     56        raise "\tempty document #{complete_path}" if document[:content].strip.empty? 
     57        log :debug => "language found: #{document[:language]}" if document[:language] 
     58        document.merge! default_fields 
    10659      rescue => e 
    10760        log :debug => "\tindexing without content: #{e.message}" 
    10861      end 
    109        
    110       writer << fields 
    111     end 
    112      
    113     def writer 
    114       @@writer ||= IndexWriter.new 
    115     end 
    116      
    117     def reader 
    118       @@reader ||= IndexReader.new 
    119     end 
    120      
    121     def reset! 
    122       log :debug => "Resetting Index" 
    123       @@writer=nil 
    124       @@reader=nil 
    125       IndexWriter.remove 
    126     end 
    127      
    128     def delete_file(complete_path) 
    129       log :debug => "\tRemoving from index" 
    130       reader.delete_by_complete_path(complete_path) 
     62      writer << document 
    13163    end 
    13264     
  • branches/yet_another_index_structure/lib/picolena/templates/app/models/plain_text_extractor.rb

    r219 r234  
    110110  def extract_content_and_language 
    111111    content=extract_content 
    112     return [content, nil] unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb) 
    113                                   Picolena::UseLanguageRecognition, 
    114                                   # Is a language guesser already installed? 
    115                                   PlainTextExtractor.language_guesser, 
    116                                   # Language recognition is too unreliable for small files. 
    117                                   content.size > 500].all? 
     112    return {:content => content} unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb) 
     113                                         Picolena::UseLanguageRecognition, 
     114                                         # Is a language guesser already installed? 
     115                                         PlainTextExtractor.language_guesser, 
     116                                         # Language recognition is too unreliable for small files. 
     117                                         content.size > 500].all? 
    118118    language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser| 
    119119      lang_guesser.write content 
     
    126126      end 
    127127    } 
    128     [content,language] 
     128    {:content => content, :language => language} 
    129129  end 
    130130end