Changeset 247
- Timestamp:
- 04/24/08 10:04:46 (7 months ago)
- Files:
-
- trunk/History.txt (modified) (1 diff)
- trunk/Manifest.txt (modified) (2 diffs)
- trunk/lib/picolena/templates/app/helpers/documents_helper.rb (modified) (1 diff)
- trunk/lib/picolena/templates/app/models/document.rb (modified) (2 diffs)
- trunk/lib/picolena/templates/app/models/finder.rb (modified) (4 diffs)
- trunk/lib/picolena/templates/app/models/index_reader.rb (deleted)
- trunk/lib/picolena/templates/app/models/index_writer.rb (deleted)
- trunk/lib/picolena/templates/app/models/indexer.rb (modified) (2 diffs)
- trunk/lib/picolena/templates/app/models/plain_text_extractor.rb (modified) (2 diffs)
- trunk/lib/picolena/templates/app/models/query.rb (modified) (2 diffs)
- trunk/lib/picolena/templates/lang/ui/de.yml (modified) (1 diff)
- trunk/lib/picolena/templates/lang/ui/en.yml (modified) (1 diff)
- trunk/lib/picolena/templates/lang/ui/es.yml (modified) (1 diff)
- trunk/lib/picolena/templates/lang/ui/fr.yml (modified) (1 diff)
- trunk/lib/picolena/templates/lib/tasks/index.rake (modified) (1 diff)
- trunk/lib/picolena/templates/spec/models/basic_finder_spec.rb (modified) (3 diffs)
- trunk/lib/picolena/templates/spec/models/finder_spec.rb (modified) (3 diffs)
- trunk/lib/picolena/templates/spec/models/index_reader_spec.rb (deleted)
- trunk/lib/picolena/templates/spec/models/index_writer_spec.rb (deleted)
- trunk/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb (modified) (2 diffs)
- trunk/website/index.html (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/History.txt
r227 r247 1 1 == 0.1.5 2008-04- 2 3 * 1 major enhancement: 4 * yet another Indexer & Index rewrite 5 2 6 * 1 minor enhancement: 3 7 * flags to indicate found language 8 9 * bug fixes: 10 * No more (or just less?) index lock errors 4 11 5 12 == 0.1.4 2008-04-23 trunk/Manifest.txt
r228 r247 23 23 lib/picolena/templates/app/models/document.rb 24 24 lib/picolena/templates/app/models/finder.rb 25 lib/picolena/templates/app/models/index_reader.rb26 lib/picolena/templates/app/models/index_writer.rb27 25 lib/picolena/templates/app/models/indexer.rb 28 26 lib/picolena/templates/app/models/plain_text_extractor.rb … … 137 135 lib/picolena/templates/spec/models/host_indexing_system_spec.rb 138 136 lib/picolena/templates/spec/models/index_directories_spec.rb 139 lib/picolena/templates/spec/models/index_reader_spec.rb140 lib/picolena/templates/spec/models/index_writer_spec.rb141 137 lib/picolena/templates/spec/models/indexer_spec.rb 142 138 lib/picolena/templates/spec/models/plain_text_extractor_spec.rb trunk/lib/picolena/templates/app/helpers/documents_helper.rb
r228 r247 39 39 40 40 def language_icon_for(document) 41 (lang=document.lang ) && image_tag("flags/#{lang}.png")41 (lang=document.language) && image_tag("flags/#{lang}.png") 42 42 end 43 43 trunk/lib/picolena/templates/app/models/document.rb
r190 r247 72 72 # Useful to know how old a document is, and to which version the cache corresponds. 73 73 def date 74 from_index[: date].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6')74 from_index[:modified].sub(/(\d{4})(\d{2})(\d{2})(\d{2})(\d{2})(\d{2})/,'\1-\2-\3 \4:\5:\6') 75 75 end 76 76 77 77 def mtime 78 from_index[: date].to_i78 from_index[:modified].to_i 79 79 end 80 80 81 81 # Returns language. 82 def lang 83 from_index[:lang ]82 def language 83 from_index[:language] 84 84 end 85 85 86 86 # Returns the id with which the document is indexed. 87 87 def index_id 88 @index_id ||= Document.find_by_complete_path(complete_path).index_id 88 @index_id ||= Finder.term_search(:complete_path, complete_path).doc 89 end 90 91 # Fields that are shared between every document. 92 def self.default_fields_for(complete_path) 93 { 94 :complete_path => complete_path, 95 :probably_unique_id => complete_path.base26_hash, 96 :filename => File.basename(complete_path), 97 :basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '), 98 :filetype => File.extname(complete_path), 99 :modified => File.mtime(complete_path).strftime("%Y%m%d%H%M%S") 100 } 89 101 end 90 102 … … 94 106 # Useful to get meta-info about it. 95 107 def from_index 96 Index Reader.new[index_id]108 Indexer.index[index_id] 97 109 end 98 110 99 111 def self.find_by_unique_id(some_id) 100 Finder.new("probably_unique_id:"<<some_id).matching_document 112 doc_id=Finder.term_search(:probably_unique_id, some_id).doc 113 new(Indexer.index[doc_id][:complete_path]) 101 114 end 102 103 def self.find_by_complete_path(complete_path) 104 Finder.new('complete_path:"'<<complete_path<<'"').matching_document 105 end 106 115 107 116 def in_indexed_directory? 108 117 !indexed_directory.nil? trunk/lib/picolena/templates/app/models/finder.rb
r177 r247 3 3 4 4 def index 5 # caching index @@index ||= 6 # causes ferret-0.11.6/lib/ferret/index.rb:768: [BUG] Segmentation fault 7 IndexReader.new 5 @@index ||= Indexer.index 8 6 end 9 7 … … 11 9 @query = Query.extract_from(raw_query) 12 10 @raw_query= raw_query 13 Index Reader.ensure_existence11 Indexer.ensure_index_existence 14 12 @per_page=results_per_page 15 13 @offset=(page.to_i-1)*results_per_page 16 index .should_have_documents14 index_should_have_documents 17 15 end 18 16 … … 32 30 found_doc.index_id=index_id 33 31 @matching_documents<<found_doc 34 rescue Errno::ENOENT35 #"File has been moved/deleted!"36 end32 rescue Errno::ENOENT 33 #"File has been moved/deleted!" 34 end 37 35 } 38 36 @executed=true … … 61 59 # exactly one document is found. 62 60 # Raises otherwise. 63 def matching_document 64 case matching_documents.size 65 when 0 66 raise IndexError, "No document found" 67 when 1 68 matching_documents.first 69 else 70 raise IndexError, "More than one document found" 71 end 72 end 61 def matching_document 62 case matching_documents.size 63 when 0 64 raise IndexError, "No document found" 65 when 1 66 matching_documents.first 67 else 68 raise IndexError, "More than one document found" 69 end 70 end 71 72 class<<self 73 def searcher 74 @@searcher ||= Ferret::Search::Searcher.new(Picolena::IndexSavePath) 75 end 76 77 def term_search(field,term) 78 query = Ferret::Search::TermQuery.new(field,term) 79 searcher.search(query).hits.first 80 end 81 82 def reload! 83 @@searcher = nil 84 @@index = nil 85 end 86 end 87 88 private 89 90 def index_should_have_documents 91 raise IndexError, "no document found" unless index.size > 0 92 end 73 93 end trunk/lib/picolena/templates/app/models/indexer.rb
r211 r247 6 6 7 7 class << self 8 def fields_for(complete_path) 9 { 10 :complete_path => complete_path, 11 :probably_unique_id => complete_path.base26_hash, 12 :file => File.basename(complete_path), 13 :basename => File.basename(complete_path, File.extname(complete_path)).gsub(/_/,' '), 14 :filetype => File.extname(complete_path), 15 :date => File.mtime(complete_path).strftime("%Y%m%d%H%M%S") 16 } 17 end 18 19 def index_every_directory(update=true) 8 def index_every_directory(remove_first=false) 9 clear! if remove_first 10 # Forces Finder.searcher and Finder.index to be reloaded, by removing them from the cache. 11 Finder.reload! 20 12 log :debug => "Indexing every directory" 21 22 23 13 start=Time.now 24 @update = update25 reset! unless update26 27 14 Picolena::IndexedDirectories.each{|dir, alias_dir| 28 15 index_directory_with_multithreads(dir) 29 16 } 30 # FIXME: with those 2 lines,17 log :debug => "Now optimizing index" 31 18 writer.optimize 32 writer.close33 # launching Indexer.index_every_directory twice in a row34 # would raise a SEGFAULT:35 # picolena/lib/picolena/templates/app/models/indexer.rb:27: [BUG] Segmentation fault36 # ruby 1.8.6 (2007-06-07) [i486-linux]37 #38 # Aborted (core dumped)39 #40 # But without those 2 lines, specs don't pass anymore.41 #42 19 log :debug => "Indexing done in #{Time.now-start} s." 43 20 end 44 21 45 22 def index_directory_with_multithreads(dir) 46 # FIXME: Don't know why, but if more than one thread is created while update the index, 47 # indexer raises: 48 # 49 # current thread not owner 50 # /usr/lib/ruby/1.8/monitor.rb:278:in `mon_check_owner' 51 # /home/www/picolena/lib/picolena/templates/lib/core_exts.rb:32:in `join' 52 # ... 53 # 54 # So Index creation is multithreaded, Index update is monothreaded. 55 threads_number = @update ? 
1 : @@max_threads_number 23 threads_number = @@max_threads_number 56 24 log :debug => "Indexing #{dir}, #{threads_number} thread(s)" 57 25 … … 62 30 indexing_list_chunks=indexing_list.in_transposed_slices(threads_number) 63 31 32 # It initializes an IndexWriter before launching multithreaded 33 # indexing. Otherwise, two threads could try to instantiate 34 # an IndexWriter at the same time, and get a 35 # Ferret::Store::Lock::LockError 36 writer 37 64 38 indexing_list_chunks.each_with_thread{|chunk| 65 39 chunk.each{|filename| 66 add_ or_update_file(filename)40 add_file(filename) 67 41 } 68 42 } 69 43 end 70 44 71 def add_or_update_file(complete_path) 72 should_be_added = true 73 if @update then 74 log :debug => "What to do with #{complete_path} ?" 75 occurences = reader.occurences_number(complete_path) 76 log :debug => "\tappears #{occurences} times in the index" 77 case occurences 78 when 0 79 #Nothing to do here, the file will be added. 80 when 1 81 d=Document.find_by_complete_path(complete_path) 82 if File.mtime(complete_path).strftime("%Y%m%d%H%M%S").to_i > d.mtime then 83 log :debug => "\thas been modified" 84 delete_file(complete_path) 85 else 86 should_be_added = false 87 log :debug => "\thas not been modified. leaving it" 88 end 89 else 90 delete_file(complete_path) 91 end 45 def add_file(complete_path) 46 default_fields = Document.default_fields_for(complete_path) 47 begin 48 document = PlainTextExtractor.extract_content_and_language_from(complete_path) 49 raise "empty document #{complete_path}" if document[:content].strip.empty? 50 document.merge! default_fields 51 log :debug => ["Added : #{complete_path}",document[:language] ? 
" (#{document[:language]})" : ""].join 52 rescue => e 53 log :debug => "\tindexing without content: #{e.message}" 54 document = default_fields 92 55 end 93 add_file(complete_path) if should_be_added56 writer << document 94 57 end 95 58 96 def add_file(complete_path) 97 log :debug => "Adding #{complete_path}" 98 mime_type=File.mime(complete_path) 99 fields = fields_for(complete_path) 100 101 begin 102 text, lang = PlainTextExtractor.extract_content_and_language_from(complete_path) 103 raise "\tempty document #{complete_path}" if text.strip.empty? 104 fields[:content] = text 105 log :debug => "language found: #{lang}" if lang 106 fields[:lang] = lang 107 rescue => e 108 log :debug => "\tindexing without content: #{e.message}" 109 end 110 111 writer << fields 59 # Ensures writer is closed, and removes every index file for RAILS_ENV. 60 def clear!(all=false) 61 close 62 to_remove=all ? Picolena::IndexesSavePath : Picolena::IndexSavePath 63 Dir.glob(File.join(to_remove,'**/*')).each{|f| FileUtils.rm(f) if File.file?(f)} 112 64 end 113 65 114 def writer 115 @@writer ||= IndexWriter.new 66 # Closes the writer and 67 # ensures that a new IndexWriter is instantiated next time writer is called. 68 def close 69 @@writer.close rescue nil 70 # Ferret will SEGFAULT otherwise. 71 @@writer = nil 116 72 end 117 73 118 def reader 119 @@reader ||= IndexReader.new 74 # Only one IndexWriter should be instantiated. 75 # If one already exists, returns it. 76 # Creates it otherwise. 77 def writer 78 @@writer ||= Ferret::Index::IndexWriter.new(default_index_params) 120 79 end 121 80 122 def reset! 123 log :debug => "Resetting Index" 124 @@writer=nil 125 @@reader=nil 126 IndexWriter.remove 81 def index 82 Ferret::Index::Index.new(default_index_params) 127 83 end 128 84 129 def delete_file(complete_path) 130 log :debug => "\tRemoving from index" 131 reader.delete_by_complete_path(complete_path) 85 def ensure_index_existence 86 index_every_directory(:remove_first) unless index_exists? 
or RAILS_ENV=="production" 132 87 end 133 88 134 89 private 135 90 91 def index_exists? 92 index_filename and File.exists?(index_filename) 93 end 94 95 def index_filename 96 Dir.glob(File.join(Picolena::IndexSavePath,'*.cfs')).first 97 end 98 136 99 def log(hash) 137 100 hash.each{|level,message| 138 101 IndexerLogger.send(level,message) 139 102 } 140 end 103 end 104 105 def default_index_params 106 {:path => Picolena::IndexSavePath, :analyzer => Picolena::Analyzer, :field_infos => default_field_infos} 107 end 108 109 def default_field_infos 110 returning Ferret::Index::FieldInfos.new do |field_infos| 111 field_infos.add_field(:complete_path, :store => :yes, :index => :untokenized) 112 field_infos.add_field(:content, :store => :yes, :index => :yes) 113 field_infos.add_field(:basename, :store => :no, :index => :yes, :boost => 1.5) 114 field_infos.add_field(:filename, :store => :no, :index => :yes, :boost => 1.5) 115 field_infos.add_field(:filetype, :store => :no, :index => :yes, :boost => 1.5) 116 field_infos.add_field(:modified, :store => :yes, :index => :untokenized) 117 field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes) 118 field_infos.add_field(:language, :store => :yes, :index => :yes) 119 end 120 end 141 121 end 142 122 end trunk/lib/picolena/templates/app/models/plain_text_extractor.rb
r219 r247 110 110 def extract_content_and_language 111 111 content=extract_content 112 return [content, nil]unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb)113 Picolena::UseLanguageRecognition,114 # Is a language guesser already installed?115 PlainTextExtractor.language_guesser,116 # Language recognition is too unreliable for small files.117 content.size > 500].all?112 return {:content => content} unless [# Is LanguageRecognition turned on? (cf config/custom/picolena.rb) 113 Picolena::UseLanguageRecognition, 114 # Is a language guesser already installed? 115 PlainTextExtractor.language_guesser, 116 # Language recognition is too unreliable for small files. 117 content.size > 500].all? 118 118 language=IO.popen(PlainTextExtractor.language_guesser,'w+'){|lang_guesser| 119 119 lang_guesser.write content … … 126 126 end 127 127 } 128 [content,language]128 {:content => content, :language => language} 129 129 end 130 130 end trunk/lib/picolena/templates/app/models/query.rb
r177 r247 14 14 /\b#{:OR.l}\b/=>'OR', 15 15 /\b#{:NOT.l}\b/=>'NOT', 16 /(#{:filename.l}):/=>'filename:', 16 17 /(#{:filetype.l}):/=>'filetype:', 17 18 /#{:content.l}:/ => 'content:', 18 /#{:date.l}:/ => 'date:', 19 /(#{:modified.l}):/ => 'modified:', 20 /(#{:language.l}):/ => 'language:', 19 21 /\b#{:LIKE.l}\s+(\S+)/=>'\1~' 20 22 } … … 26 28 # Instantiates a QueryParser once, and keeps it in cache. 27 29 def parser 28 @@parser ||= Ferret::QueryParser.new(:fields => [:content, :file , :basename, :filetype, :date], :or_default => false, :analyzer=>Picolena::Analyzer)30 @@parser ||= Ferret::QueryParser.new(:fields => [:content, :filename, :basename, :filetype, :modified], :or_default => false, :analyzer=>Picolena::Analyzer) 29 31 end 30 32 end trunk/lib/picolena/templates/lang/ui/de.yml
r128 r247 20 20 21 21 ## Fields 22 filename: filename|file|datei 22 23 filetype: erweiterung|ext 23 24 content: inhalt 24 date: jahr|zeit 25 modified: jahr|zeit|geändert 26 language: lang|sprache trunk/lib/picolena/templates/lang/ui/en.yml
r128 r247 20 20 21 21 ## Fields 22 filename: filename|file 22 23 filetype: filetype|ext 23 24 content: content 24 date: year|date 25 modified: year|date|modified 26 language: lang|language trunk/lib/picolena/templates/lang/ui/es.yml
r128 r247 20 20 21 21 ## Fields 22 filename: filename|file|archivo 22 23 filetype: extensión|ext 23 24 content: contenido 24 date: fecha|año|anho 25 modified: fecha|año|anho|modificado 26 language: lang|idioma trunk/lib/picolena/templates/lang/ui/fr.yml
r128 r247 20 20 21 21 ## Fields 22 filename: filename|file|fichier 22 23 filetype: extension|ext 23 24 content: contenu 24 date: année|date|annee 25 modified: année|date|annee|modifie 26 language: lang|langue trunk/lib/picolena/templates/lib/tasks/index.rake
r153 r247 3 3 desc 'Clear indexes' 4 4 task :clear => :environment do 5 Index Writer.remove5 Indexer.clear! :all 6 6 end 7 7 8 8 desc 'Create index' 9 9 task :create => :environment do 10 Indexer.index_every_directory( update=false)10 Indexer.index_every_directory(remove_first=true) 11 11 end 12 12 13 13 desc 'Update index' 14 14 task :update => :environment do 15 Indexer.index_every_directory (update=true)15 Indexer.index_every_directory 16 16 end 17 17 trunk/lib/picolena/templates/spec/models/basic_finder_spec.rb
r167 r247 11 11 12 12 before(:each) do 13 Index Writer.remove13 Indexer.clear! 14 14 end 15 15 16 16 it "should create index" do 17 17 Picolena::IndexedDirectories.replace({'spec/test_dirs/indexed/just_one_doc'=>'//justonedoc/'}) 18 lambda {@finder_with_new_index=Finder.new("test moi")}.should change(Index Reader, :exists?).from(false).to(true)18 lambda {@finder_with_new_index=Finder.new("test moi")}.should change(Indexer, :index_exists?).from(false).to(true) 19 19 File.exists?(File.join(@new_index_path,'_0.cfs')).should be_true 20 Index Reader.new.size.should >020 Indexer.index.size.should >0 21 21 end 22 22 … … 36 36 fields={ 37 37 # description => key 38 :content=>:content, 39 :basename=>:basename, 40 :filename=>:file, 41 :extension => :filetype, 42 :modification_time=>:date 38 :content => :content, 39 :complete_path => :complete_path, 40 :basename => :basename, 41 :filename => :filename, 42 :extension => :filetype, 43 :modification_time => :modified, 44 :probably_unique_id => :probably_unique_id, 45 :language => :language 43 46 } 44 47 45 48 describe "Basic Finder" do 46 49 before(:all) do 47 Indexer.index_every_directory( update=false)50 Indexer.index_every_directory(remove_first=true) 48 51 end 49 52 … … 83 86 fields.each_pair do |description,field_name| 84 87 it "should index #{description} as :#{field_name}" do 85 Index Reader.new.field_infos[field_name].should be_an_instance_of(Ferret::Index::FieldInfo)88 Indexer.index.field_infos[field_name].should be_an_instance_of(Ferret::Index::FieldInfo) 86 89 end 87 90 end trunk/lib/picolena/templates/spec/models/finder_spec.rb
r217 r247 22 22 File.utime(0, a_bit_later, 'spec/test_dirs/indexed/yet_another_dir/office2003-word-template.dot') 23 23 File.utime(0, nineties, 'spec/test_dirs/indexed/others/placeholder.txt') 24 Indexer.index_every_directory( update=false)24 Indexer.index_every_directory(remove_first=true) 25 25 end 26 26 … … 31 31 end 32 32 33 it "should find documents according to their filename when specified with file:query " do34 Finder.new("file :crossed.text").matching_documents.collect{|d| d.content}.should include("txt inside!")33 it "should find documents according to their filename when specified with file:query or filename:query" do 34 Finder.new("filename:crossed.text").matching_documents.collect{|d| d.content}.should include("txt inside!") 35 35 Finder.new("file:crossed.txt").matching_documents.collect{|d| d.content}.should include("text inside!") 36 36 end … … 48 48 49 49 it "should give a boost to basename, filename and filetype in index" do 50 index=Index Reader.new50 index=Indexer.index 51 51 index.field_infos[:basename].boost.should > 1.0 52 index.field_infos[:file ].boost.should > 1.052 index.field_infos[:filename].boost.should > 1.0 53 53 index.field_infos[:filetype].boost.should > 1.0 54 54 end trunk/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
r214 r247 3 3 describe "PlainTextExtractors" do 4 4 before(:all) do 5 Index Reader.ensure_existence5 Indexer.ensure_index_existence 6 6 end 7 7 … … 30 30 31 31 it "should guess language when enough content is available" do 32 Document.new("spec/test_dirs/indexed/lang/goethe").lang .should == "de"33 Document.new("spec/test_dirs/indexed/lang/shakespeare").lang .should == "en"34 Document.new("spec/test_dirs/indexed/lang/lorca").lang .should == "es"35 Document.new("spec/test_dirs/indexed/lang/hugo").lang .should == "fr"32 Document.new("spec/test_dirs/indexed/lang/goethe").language.should == "de" 33 Document.new("spec/test_dirs/indexed/lang/shakespeare").language.should == "en" 34 Document.new("spec/test_dirs/indexed/lang/lorca").language.should == "es" 35 Document.new("spec/test_dirs/indexed/lang/hugo").language.should == "fr" 36 36 end 37 37 38 38 it "should not try to guess language when file is too small" do 39 Document.new("spec/test_dirs/indexed/basic/hello.rb").lang .should be_empty40 Document.new("spec/test_dirs/indexed/README").lang .should be_empty39 Document.new("spec/test_dirs/indexed/basic/hello.rb").language.should be_nil 40 Document.new("spec/test_dirs/indexed/README").language.should be_nil 41 41 end 42 42 end trunk/website/index.html
r210 r247 34 34 <div id="version" class="clickable" onclick='document.location = "http://rubyforge.org/projects/picolena"; return false'> 35 35 <p>Get Version</p> 36 <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1. 4</a>36 <a href="http://rubyforge.org/projects/picolena" class="numbers">0.1.5</a> 37 37 </div> 38 38 <h1>→ ‘picolena’</h1> … … 115 115 <p>Comments are welcome. Send an email to <a href="mailto:eric_duminil@rubyforge.org">Eric Duminil</a> email via the <a href="http://groups.google.com/group/picolena">forum</a></p> 116 116 <p class="coda"> 117 <a href="eric_duminil@rubyforge.org">Eric DUMINIL</a>, 9th April 2008<br>117 <a href="eric_duminil@rubyforge.org">Eric DUMINIL</a>, 20th April 2008<br> 118 118 Theme extended from <a href="http://rb2js.rubyforge.org/">Paul Battley</a>, 119 119 by Daniel Cadenas via <a href="http://depgraph.rubyforge.org/">DepGraph</a>
