root/trunk/lib/picolena/templates/lib/plain_text_extractor_DSL.rb

Revision 309, 3.3 kB (checked in by eric.dumin..@gmail.com, 7 months ago)

Occam's Razor

  • Property svn:executable set to *
Line 
1 # Defines plain text extractors with DSL
2 # For example, to convert "Microsoft Office Word document" to plain text
3 #  PlainTextExtractor.new {
4 #    every :doc, :dot
5 #    as "application/msword"
6 #    aka "Microsoft Office Word document"
7 #    with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
8 #    which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
9 #    or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
10 #  }
11
# Mixin providing the small DSL used to declare a plain text extractor.
# The including class is instantiated with a block; the block is evaluated
# against the new instance, so the DSL words below (every, as, aka, with, ...)
# are ordinary instance methods that store their arguments.
module PlainTextExtractorDSL
  attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples

  # Evaluates the DSL block in the context of the new extractor, then hands
  # the configured extractor to PlainTextExtractor.add.
  def initialize(&block)
    @content_and_file_examples = []
    instance_eval(&block)
    PlainTextExtractor.add(self)
  end

  # File extensions handled by this extractor, e.g.
  #   every :doc, :dot
  def every(*exts)
    @exts = exts
  end

  # MIME type name, e.g.
  #   as "application/msword"
  def as(mime_name)
    @mime_name = mime_name
  end

  # Human readable description, e.g.
  #   aka "Microsoft Office Word document"
  def aka(description)
    @description = description
  end

  # Stores the given dependencies in @dependencies.
  # (No reader for them is defined in this module.)
  def which_requires(*dependencies)
    @dependencies = dependencies
  end

  # Used by rspec to test extractors:
  #   which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
  #   or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
  #
  # This spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an
  # indexed directory, if every dependency is installed, and if the plain text
  # output of the extractor applied to 'basic.pdf' and 'yet_another.pdf'
  # respectively includes 'in a pdf file' and
  # 'some other stuff inside another pdf file'.
  #
  # content:: text expected in the extracted output
  # file::    hash whose :from entry is the example file name
  def which_should_for_example_extract(content, file)
    @content_and_file_examples << [content, file[:from]]
  end

  # Allows specs to be chained in this way:
  #   which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
  #   or_extract 'text inside!', :from => 'crossed.txt'
  alias_method :or_extract, :which_should_for_example_extract

  # Defines the command used to extract plain text. Accepts either:
  # * a String, used as-is on every platform;
  # * a Hash mapping commands to platform markers:
  #       with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
  #            "some other command"            => :on_windows
  #   in which case the command matching the current platform is selected;
  # * a block, stored as-is when no String/Hash is given.
  #
  # Raises RuntimeError when neither a command nor a block is given, or when
  # a Hash is given but contains no command for the current platform.
  #
  # On linux and mac_os, ' 2>/dev/null' is appended to String commands that
  # do not contain a pipe.
  def with(command_as_hash_or_string = nil, &block)
    platform = current_platform
    @command = case command_as_hash_or_string
               when String
                 # dup: the command may be extended below, and the caller's
                 # (possibly frozen) string must not be modified.
                 command_as_hash_or_string.dup
               when Hash
                 command_for_platform(command_as_hash_or_string, platform)
               else
                 block || raise("No command defined for this extractor: #{description}")
               end
    # TODO, replace it with Open3 or something.
    @command << ' 2>/dev/null' if @command.is_a?(String) && [:linux, :mac_os].include?(platform) && !@command.include?('|')
  end

  private

  # Maps RUBY_PLATFORM to :linux, :mac_os or :windows (nil when unknown).
  # /darwin/ has to be tested before the windows pattern because "darwin"
  # itself contains "win".
  #TODO: Find a better way to manage platforms, and include Vista, BSD...
  def current_platform
    case RUBY_PLATFORM
    when /darwin/
      :mac_os
    when /linux/
      :linux
    when /win|mingw/
      :windows
    end
  end

  # Returns a copy of the first command in +commands+ whose platform marker
  # (:on_linux, :on_linux_and_mac_os, ...) names +platform+.
  # Raises RuntimeError when no entry matches.
  def command_for_platform(commands, platform)
    match = commands.find do |_command, platforms|
      # :on_linux_and_mac_os => [:linux, :mac_os]
      names = platforms.to_s.split(/_?and_?/i).map { |on_platform| on_platform.sub(/on_/, '').to_sym }
      names.include?(platform)
    end
    unless match
      raise("No command defined on platform #{platform.inspect} (#{RUBY_PLATFORM}) for this extractor: #{description}")
    end
    match.first.dup
  end
end
Note: See TracBrowser for help on using the browser.