Changeset 234
- Timestamp:
- 04/24/08 03:16:24 (7 months ago)
- Files:
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
branches/yet_another_index_structure/lib/picolena/templates/app/models/indexer.rb
r233 r234
  17   17     end
  18   18
  19      -   def index_every_directory (update=true)
       19 +   def index_every_directory
  20   20       log :debug => "Indexing every directory"
  21      -
  22   21       start=Time.now
  23      -     @update = update
  24      -     reset! unless update
  25      -
  26   22       Picolena::IndexedDirectories.each{|dir, alias_dir|
  27   23         index_directory_with_multithreads(dir)
  28   24       }
  29      -     # FIXME: with those 2 lines,
  30      -     writer.optimize
  31      -     writer.close
  32      -     # launching Indexer.index_every_directory twice in a row
  33      -     # would raise a SEGFAULT:
  34      -     # picolena/lib/picolena/templates/app/models/indexer.rb:27: [BUG] Segmentation fault
  35      -     # ruby 1.8.6 (2007-06-07) [i486-linux]
  36      -     #
  37      -     # Aborted (core dumped)
  38      -     #
  39      -     # But without those 2 lines, specs don't pass anymore.
  40      -     #
  41   25       log :debug => "Indexing done in #{Time.now-start} s."
  42   26     end
  43   27
  44   28     def index_directory_with_multithreads(dir)
  45      -     # FIXME: Don't know why, but if more than one thread is created while update the index,
  46      -     # indexer raises:
  47      -     #
  48      -     # current thread not owner
  49      -     # /usr/lib/ruby/1.8/monitor.rb:278:in `mon_check_owner'
  50      -     # /home/www/picolena/lib/picolena/templates/lib/core_exts.rb:32:in `join'
  51      -     # ...
  52      -     #
  53      -     # So Index creation is multithreaded, Index update is monothreaded.
  54      -     threads_number = @update ? 1 : @@max_threads_number
       29 +     threads_number = @@max_threads_number
  55   30       log :debug => "Indexing #{dir}, #{threads_number} thread(s)"
  56   31
   …    …
  70   45     def add_or_update_file(complete_path)
  71   46       should_be_added = true
  72      -     if @update then
  73      -       log :debug => "What to do with #{complete_path} ?"
  74      -       occurences = reader.occurences_number(complete_path)
  75      -       log :debug => "\tappears #{occurences} times in the index"
  76      -       case occurences
  77      -       when 0
  78      -         #Nothing to do here, the file will be added.
  79      -       when 1
  80      -         d=Document.find_by_complete_path(complete_path)
  81      -         if File.mtime(complete_path).strftime("%Y%m%d%H%M%S").to_i > d.mtime then
  82      -           log :debug => "\thas been modified"
  83      -           delete_file(complete_path)
  84      -         else
  85      -           should_be_added = false
  86      -           log :debug => "\thas not been modified. leaving it"
  87      -         end
  88      -       else
  89      -         delete_file(complete_path)
  90      -       end
  91      -     end
  92      -     add_file(complete_path) if should_be_added
       47 +     add_file(complete_path)
  93   48     end
  94   49
   …    …
  96   51       log :debug => "Adding #{complete_path}"
  97   52       mime_type=File.mime(complete_path)
  98      -     fields = fields_for(complete_path)
  99      -
       53 +     default_fields = fields_for(complete_path)
 100   54       begin
 101      -       text, lang = PlainTextExtractor.extract_content_and_language_from(complete_path)
 102      -       raise "\tempty document #{complete_path}" if text.strip.empty?
 103      -       fields[:content] = text
 104      -       log :debug => "language found: #{lang}" if lang
 105      -       fields[:lang] = lang
       55 +       document = PlainTextExtractor.extract_content_and_language_from(complete_path)
       56 +       raise "\tempty document #{complete_path}" if document[:content].strip.empty?
       57 +       log :debug => "language found: #{document[:language]}" if document[:language]
       58 +       document.merge! default_fields
 106   59       rescue => e
 107   60         log :debug => "\tindexing without content: #{e.message}"
 108   61       end
 109      -
 110      -     writer << fields
 111      -   end
 112      -
 113      -   def writer
 114      -     @@writer ||= IndexWriter.new
 115      -   end
 116      -
 117      -   def reader
 118      -     @@reader ||= IndexReader.new
 119      -   end
 120      -
 121      -   def reset!
 122      -     log :debug => "Resetting Index"
 123      -     @@writer=nil
 124      -     @@reader=nil
 125      -     IndexWriter.remove
 126      -   end
 127      -
 128      -   def delete_file(complete_path)
 129      -     log :debug => "\tRemoving from index"
 130      -     reader.delete_by_complete_path(complete_path)
       62 +     writer << document
 131   63     end
 132   64
branches/yet_another_index_structure/lib/picolena/templates/app/models/plain_text_extractor.rb
r219 r234
 110  110     def extract_content_and_language
 111  111       content=extract_content
 112      -     return [content, nil] unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
 113      -                                   Picolena::UseLanguageRecognition,
 114      -                                   # Is a language guesser already installed?
 115      -                                   PlainTextExtractor.language_guesser,
 116      -                                   # Language recognition is too unreliable for small files.
 117      -                                   content.size > 500].all?
      112 +     return {:content => content} unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)
      113 +                                          Picolena::UseLanguageRecognition,
      114 +                                          # Is a language guesser already installed?
      115 +                                          PlainTextExtractor.language_guesser,
      116 +                                          # Language recognition is too unreliable for small files.
      117 +                                          content.size > 500].all?
 118  118       language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser|
 119  119         lang_guesser.write content
   …    …
 126  126         end
 127  127       }
 128      -     [content,language]
      128 +     {:content => content, :language => language}
 129  129     end
 130  130   end
