Changeset 188

Show
Ignore:
Timestamp:
04/20/08 08:04:25 (7 months ago)
Author:
eric.dumin..@gmail.com
Message:

Lang recognition with specs!

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • branches/lang_recognition/Manifest.txt

    r168 r188  
    2222lib/picolena/templates/app/helpers/documents_helper.rb 
    2323lib/picolena/templates/app/models/document.rb 
    24 lib/picolena/templates/app/models/filter.rb 
    2524lib/picolena/templates/app/models/finder.rb 
    2625lib/picolena/templates/app/models/index_reader.rb 
    2726lib/picolena/templates/app/models/index_writer.rb 
    2827lib/picolena/templates/app/models/indexer.rb 
     28lib/picolena/templates/app/models/plain_text_extractor.rb 
    2929lib/picolena/templates/app/models/query.rb 
    3030lib/picolena/templates/app/views/documents/_document.html.haml 
     
    5151lib/picolena/templates/lang/ui/fr.yml 
    5252lib/picolena/templates/lib/core_exts.rb 
    53 lib/picolena/templates/lib/filter_DSL.rb 
    54 lib/picolena/templates/lib/filters/adobe.pdf.rb 
    55 lib/picolena/templates/lib/filters/html.rb 
    56 lib/picolena/templates/lib/filters/ms.excel.rb 
    57 lib/picolena/templates/lib/filters/ms.powerpoint.rb 
    58 lib/picolena/templates/lib/filters/ms.rtf.rb 
    59 lib/picolena/templates/lib/filters/ms.word.rb 
    60 lib/picolena/templates/lib/filters/opendocument.presentation.rb 
    61 lib/picolena/templates/lib/filters/opendocument.spreadsheet.rb 
    62 lib/picolena/templates/lib/filters/opendocument.text.rb 
    63 lib/picolena/templates/lib/filters/plain_text.rb 
     53lib/picolena/templates/lib/plain_text_extractor_DSL.rb 
     54lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb 
     55lib/picolena/templates/lib/plain_text_extractors/html.rb 
     56lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb 
     57lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb 
     58lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb 
     59lib/picolena/templates/lib/plain_text_extractors/ms.word.rb 
     60lib/picolena/templates/lib/plain_text_extractors/opendocument.presentation.rb 
     61lib/picolena/templates/lib/plain_text_extractors/opendocument.spreadsheet.rb 
     62lib/picolena/templates/lib/plain_text_extractors/opendocument.text.rb 
     63lib/picolena/templates/lib/plain_text_extractors/plain_text.rb 
    6464lib/picolena/templates/lib/tasks/annotations.rake 
    6565lib/picolena/templates/lib/tasks/index.rake 
     
    118118lib/picolena/templates/spec/models/basic_finder_spec.rb 
    119119lib/picolena/templates/spec/models/document_spec.rb 
    120 lib/picolena/templates/spec/models/filters_spec.rb 
    121120lib/picolena/templates/spec/models/finder_spec.rb 
    122121lib/picolena/templates/spec/models/host_indexing_system_spec.rb 
     
    125124lib/picolena/templates/spec/models/index_writer_spec.rb 
    126125lib/picolena/templates/spec/models/indexer_spec.rb 
     126lib/picolena/templates/spec/models/plain_text_extractor_spec.rb 
    127127lib/picolena/templates/spec/models/query_spec.rb 
    128128lib/picolena/templates/spec/rcov.opts 
     
    150150lib/picolena/templates/spec/test_dirs/indexed/different_encodings/utf-8.txt 
    151151lib/picolena/templates/spec/test_dirs/indexed/just_one_doc/for_test.txt 
     152lib/picolena/templates/spec/test_dirs/indexed/lang/goethe 
     153lib/picolena/templates/spec/test_dirs/indexed/lang/hugo 
     154lib/picolena/templates/spec/test_dirs/indexed/lang/lorca 
     155lib/picolena/templates/spec/test_dirs/indexed/lang/shakespeare 
    152156lib/picolena/templates/spec/test_dirs/indexed/literature/Simulation of district heating systems for evaluation of real-time control strategies.pdf 
    153157lib/picolena/templates/spec/test_dirs/indexed/literature/Types of malfunction in DH substations.doc 
  • branches/lang_recognition/lib/picolena/config/basic.rb

    r177 r188  
    1212  Globalite.language = :en 
    1313   
     14   
     15  # Is more than one language used in indexed documents? 
     16  # Picolena can try to recognise the language used, and save it in the index. 
     17  # It is then possible to look for documents according to their language. 
     18  # 
     19  # If every document is written in the same language, setting UseLanguageRecognition to false 
     20  # will speed up the indexing process. 
     21  UseLanguageRecognition = true 
    1422   
    1523  # Specify which locale should be used by Ferret 
  • branches/lang_recognition/lib/picolena/templates/app/models/document.rb

    r180 r188  
    7979  end 
    8080   
     81  # Returns language. 
     82  def lang 
     83    from_index[:lang] 
     84  end 
     85   
    8186  # Returns the id with which the document is indexed. 
    8287  def index_id 
  • branches/lang_recognition/lib/picolena/templates/app/models/index_writer.rb

    r177 r188  
    3232    field_infos.add_field(:date,               :store => :yes, :index => :yes) 
    3333    field_infos.add_field(:probably_unique_id, :store => :no,  :index => :yes) 
     34    field_infos.add_field(:lang,               :store => :yes,  :index => :yes) 
    3435  end  
    3536end 
  • branches/lang_recognition/lib/picolena/templates/app/models/indexer.rb

    r178 r188  
    100100       
    101101      begin  
    102         text = PlainTextExtractor.extract_content_from(complete_path) 
     102        text, lang = PlainTextExtractor.extract_content_and_language_from(complete_path) 
    103103        raise "\tempty document #{complete_path}" if text.strip.empty? 
    104104        fields[:content] = text 
     105        log :debug => "language found: #{lang}" if lang 
     106        fields[:lang] = lang 
    105107      rescue => e 
    106108        log :debug => "\tindexing without content: #{e.message}" 
     
    134136    def log(hash) 
    135137      hash.each{|level,message| 
    136         puts "#{level} -> #{message}" 
    137138        IndexerLogger.send(level,message) 
    138139      } 
  • branches/lang_recognition/lib/picolena/templates/app/models/plain_text_extractor.rb

    r179 r188  
    4141    def extract_content_from(source) 
    4242      find_by_filename(source).extract_content 
     43    end 
     44     
     45    def extract_content_and_language_from(source) 
     46      find_by_filename(source).extract_content_and_language 
    4347    end 
    4448  end 
     
    9094      command.call(source) 
    9195    end 
    92   end     
     96  end 
     97   
     98  # Returns the plain text content and the language of the source file, 
     99  # using mguesser to guess the language. 
     100  # A language is only returned if language recognition is enabled, the content is longer 
     101  # than 500 characters and the probability score is at least 90%; otherwise the language is nil. 
     102  def extract_content_and_language 
     103    content=extract_content 
     104    # Language recognition is too unreliable for small files. 
     105    return [content, nil] unless Picolena::UseLanguageRecognition && content.size > 500 
     106    language=IO.popen("mguesser -n1",'w+'){|lang_guesser| 
     107      lang_guesser.write content 
     108      lang_guesser.close_write 
     109      output=lang_guesser.read 
     110      if output=~/^([01]\.\d+)\t(\w+)\t(\w+)/ then 
     111        score, lang, encoding = $1.to_f, $2, $3 
     112        # Language recognition isn't reliable if score is too low. 
     113        lang unless score<0.9 
     114      end 
     115    } 
     116    [content,language] 
     117  end 
    93118end 
  • branches/lang_recognition/lib/picolena/templates/spec/models/plain_text_extractor_spec.rb

    r178 r188  
    2828    } 
    2929  } 
     30   
     31  it "should guess language when enough content is available" do 
     32    Document.new("spec/test_dirs/indexed/lang/goethe").lang.should == "de" 
     33    Document.new("spec/test_dirs/indexed/lang/shakespeare").lang.should == "en" 
     34    Document.new("spec/test_dirs/indexed/lang/lorca").lang.should == "es" 
     35    Document.new("spec/test_dirs/indexed/lang/hugo").lang.should == "fr" 
     36  end 
     37   
     38  it "should not try to guess language when file is too small" do 
     39    Document.new("spec/test_dirs/indexed/basic/hello.rb").lang.should be_empty 
     40    Document.new("spec/test_dirs/indexed/README").lang.should be_empty 
     41  end 
    3042end