Changeset 188
- Timestamp:
- 04/20/08 08:04:25 (7 months ago)
- Files:
-
- branches/lang_recognition/Manifest.txt (modified) (5 diffs)
- branches/lang_recognition/lib/picolena/config/basic.rb (modified) (1 diff)
- branches/lang_recognition/lib/picolena/templates/app/models/document.rb (modified) (1 diff)
- branches/lang_recognition/lib/picolena/templates/app/models/index_writer.rb (modified) (1 diff)
- branches/lang_recognition/lib/picolena/templates/app/models/indexer.rb (modified) (2 diffs)
- branches/lang_recognition/lib/picolena/templates/app/models/plain_text_extractor.rb (modified) (2 diffs)
- branches/lang_recognition/lib/picolena/templates/lib/filters (added)
- branches/lang_recognition/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb (modified) (1 diff)
- branches/lang_recognition/lib/picolena/templates/spec/test_dirs/indexed/lang (added)
- branches/lang_recognition/lib/picolena/templates/spec/test_dirs/indexed/lang/goethe (added)
- branches/lang_recognition/lib/picolena/templates/spec/test_dirs/indexed/lang/hugo (added)
- branches/lang_recognition/lib/picolena/templates/spec/test_dirs/indexed/lang/lorca (added)
- branches/lang_recognition/lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare (added)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
branches/lang_recognition/Manifest.txt
r168 r188 22 22 lib/picolena/templates/app/helpers/documents_helper.rb 23 23 lib/picolena/templates/app/models/document.rb 24 lib/picolena/templates/app/models/filter.rb 25 24 lib/picolena/templates/app/models/finder.rb 26 25 lib/picolena/templates/app/models/index_reader.rb 27 26 lib/picolena/templates/app/models/index_writer.rb 28 27 lib/picolena/templates/app/models/indexer.rb 28 lib/picolena/templates/app/models/plain_text_extractor.rb 29 29 lib/picolena/templates/app/models/query.rb 30 30 lib/picolena/templates/app/views/documents/_document.html.haml … … 51 51 lib/picolena/templates/lang/ui/fr.yml 52 52 lib/picolena/templates/lib/core_exts.rb 53 lib/picolena/templates/lib/filter_DSL.rb 54 lib/picolena/templates/lib/filters/adobe.pdf.rb 55 lib/picolena/templates/lib/filters/html.rb 56 lib/picolena/templates/lib/filters/ms.excel.rb 57 lib/picolena/templates/lib/filters/ms.powerpoint.rb 58 lib/picolena/templates/lib/filters/ms.rtf.rb 59 lib/picolena/templates/lib/filters/ms.word.rb 60 lib/picolena/templates/lib/filters/opendocument.presentation.rb 61 lib/picolena/templates/lib/filters/opendocument.spreadsheet.rb 62 lib/picolena/templates/lib/filters/opendocument.text.rb 63 lib/picolena/templates/lib/filters/plain_text.rb 53 lib/picolena/templates/lib/plain_text_extractor_DSL.rb 54 lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb 55 lib/picolena/templates/lib/plain_text_extractors/html.rb 56 lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb 57 lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb 58 lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb 59 lib/picolena/templates/lib/plain_text_extractors/ms.word.rb 60 lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb 61 lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb 62 lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb 63 lib/picolena/templates/lib/plain_text_extractors/plain_text.rb 64 64
lib/picolena/templates/lib/tasks/annotations.rake 65 65 lib/picolena/templates/lib/tasks/index.rake … … 118 118 lib/picolena/templates/spec/models/basic_finder_spec.rb 119 119 lib/picolena/templates/spec/models/document_spec.rb 120 lib/picolena/templates/spec/models/filters_spec.rb 121 120 lib/picolena/templates/spec/models/finder_spec.rb 122 121 lib/picolena/templates/spec/models/host_indexing_system_spec.rb … … 125 124 lib/picolena/templates/spec/models/index_writer_spec.rb 126 125 lib/picolena/templates/spec/models/indexer_spec.rb 126 lib/picolena/templates/spec/models/plain_text_extractor_spec.rb 127 127 lib/picolena/templates/spec/models/query_spec.rb 128 128 lib/picolena/templates/spec/rcov.opts … … 150 150 lib/picolena/templates/spec/test_dirs/indexed/different_encodings/utf-8.txt 151 151 lib/picolena/templates/spec/test_dirs/indexed/just_one_doc/for_test.txt 152 lib/picolena/templates/spec/test_dirs/indexed/lang/goethe 153 lib/picolena/templates/spec/test_dirs/indexed/lang/hugo 154 lib/picolena/templates/spec/test_dirs/indexed/lang/lorca 155 lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare 152 156 lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf 153 157 lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc branches/lang_recognition/lib/picolena/config/basic.rb
r177 r188 12 12 Globalite.language = :en 13 13 14 15 # Is more than one language used in indexed documents? 16 # Picolena can try to recognise the language used, and save it in the index. 17 # It is then possible to look for documents according to their language. 18 # 19 # If every document is written in the same language, turning UseLanguageRecognition to false 20 # will speed up the indexing process 21 UseLanguageRecognition = true 14 22 15 23 # Specify which locale should be used by Ferret branches/lang_recognition/lib/picolena/templates/app/models/document.rb
r180 r188 79 79 end 80 80 81 # Returns language. 82 def lang 83 from_index[:lang] 84 end 85 81 86 # Returns the id with which the document is indexed. 82 87 def index_id branches/lang_recognition/lib/picolena/templates/app/models/index_writer.rb
r177 r188 32 32 field_infos.add_field(:date, :store => :yes, :index => :yes) 33 33 field_infos.add_field(:probably_unique_id, :store => :no, :index => :yes) 34 field_infos.add_field(:lang, :store => :yes, :index => :yes) 34 35 end 35 36 end branches/lang_recognition/lib/picolena/templates/app/models/indexer.rb
r178 r188 100 100 101 101 begin 102 text = PlainTextExtractor.extract_content_from(complete_path) 102 text, lang = PlainTextExtractor.extract_content_and_language_from(complete_path) 103 103 raise "\tempty document #{complete_path}" if text.strip.empty? 104 104 fields[:content] = text 105 log :debug => "language found: #{lang}" if lang 106 fields[:lang] = lang 105 107 rescue => e 106 108 log :debug => "\tindexing without content: #{e.message}" … … 134 136 def log(hash) 135 137 hash.each{|level,message| 136 puts "#{level} -> #{message}" 137 138 IndexerLogger.send(level,message) 138 139 } branches/lang_recognition/lib/picolena/templates/app/models/plain_text_extractor.rb
r179 r188 41 41 def extract_content_from(source) 42 42 find_by_filename(source).extract_content 43 end 44 45 def extract_content_and_language_from(source) 46 find_by_filename(source).extract_content_and_language 43 47 end 44 48 end … … 90 94 command.call(source) 91 95 end 92 end 96 end 97 98 # Returns plain text content and language of source file, 99 # using mguesser to guess used language. 100 # This method only returns probable language if the content is bigger than 500 chars 101 # and if probability score is higher than 90%. 102 def extract_content_and_language 103 content=extract_content 104 # Language recognition is too unreliable for small files. 105 return [content, nil] unless Picolena::UseLanguageRecognition && content.size > 500 106 language=IO.popen("mguesser -n1",'w+'){|lang_guesser| 107 lang_guesser.write content 108 lang_guesser.close_write 109 output=lang_guesser.read 110 if output=~/^([01]\.\d+)\t(\w+)\t(\w+)/ then 111 score, lang, encoding = $1.to_f, $2, $3 112 # Language recognition isn't reliable if score is too low. 113 lang unless score<0.9 114 end 115 } 116 [content,language] 117 end 93 118 end branches/lang_recognition/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb
r178 r188 28 28 } 29 29 } 30 31 it "should guess language when enough content is available" do 32 Document.new("spec/test_dirs/indexed/lang/goethe").lang.should == "de" 33 Document.new("spec/test_dirs/indexed/lang/shakespeare").lang.should == "en" 34 Document.new("spec/test_dirs/indexed/lang/lorca").lang.should == "es" 35 Document.new("spec/test_dirs/indexed/lang/hugo").lang.should == "fr" 36 end 37 38 it "should not try to guess language when file is too small" do 39 Document.new("spec/test_dirs/indexed/basic/hello.rb").lang.should be_empty 40 Document.new("spec/test_dirs/indexed/README").lang.should be_empty 41 end 30 42 end
