# Defines plain text extractors with a DSL.
# For example, to convert a "Microsoft Office Word document" to plain text:
#   PlainTextExtractor.new {
#     every :doc, :dot
#     as "application/msword"
#     aka "Microsoft Office Word document"
#     with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
#     which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
#     or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
#   }

# Mixin holding the DSL methods that are evaluated inside the block given to
# the constructor of an including class. Each DSL call stores one piece of the
# extractor definition in an instance variable; most are exposed through the
# readers declared below.
module PlainTextExtractorDSL
  attr_reader :exts, :mime_name, :description, :command, :content_and_file_examples

  # Evaluates +block+ with the new object as +self+ (so the DSL methods below
  # can be called without a receiver), then hands the configured object to
  # PlainTextExtractor.add.
  def initialize(&block)
    @content_and_file_examples=[]
    self.instance_eval(&block)
    PlainTextExtractor.add(self)
  end

  # Stores the file extensions given as arguments, e.g.
  #   every :doc, :dot
  def every(*exts)
    @exts=exts
  end

  # Stores the MIME name, e.g.
  #   as "application/msword"
  def as(mime_name)
    @mime_name=mime_name
  end

  # Stores a human readable description, e.g.
  #   aka "Microsoft Office Word document"
  def aka(description)
    @description=description
  end

  # Stores the given dependencies in @dependencies.
  # No reader for them is declared in this module.
  def which_requires(*dependencies)
    @dependencies=dependencies
  end

  #used by rspec to test extractors:
  # which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
  # or_extract 'some other stuff inside another pdf file', :from => 'yet_another.pdf'
  #
  #this spec will pass if 'basic.pdf' and 'yet_another.pdf' are included in an indexed directory, if every dependency is installed,
  #and if plain text output from the extractor applied to 'basic.pdf' and 'yet_another.pdf' respectively include 'in a pdf file' and 'some other stuff inside another pdf file'
  #
  # +content+ is the expected text, +file+ is a hash whose :from entry is the
  # example file name. The pair [content, file name] is appended to
  # content_and_file_examples.
  def which_should_for_example_extract(content, file)
    @content_and_file_examples << [content,file[:from]]
  end

  #it allows to define specs in this way:
  # which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
  # or_extract 'text inside!', :from => 'crossed.txt'
  alias_method :or_extract, :which_should_for_example_extract

  # Defines the command used by this extractor and stores it in @command.
  #
  # Accepts one of:
  # * a String: used as the command on every platform;
  # * a Hash mapping command strings to platform symbols (see below);
  # * a block: stored as is when no String/Hash is given.
  #
  # Raises a RuntimeError when nothing usable is given, or when a Hash has no
  # entry for the detected platform.
  def with(command_as_hash_or_string=nil,&block)
    #TODO: Find a better way to manage platforms, and include OS X, Vista, BSD...
    # NOTE(review): the regexp literals of this method were missing from the
    # source as received; the patterns below are reconstructed from the
    # platform names used in the comments (:on_linux, :on_windows, :on_mac_os)
    # and should be confirmed against the project history.
    # The windows pattern is deliberately not /win/, which would also match "darwin".
    platform=case RUBY_PLATFORM
    when /linux/i
      :linux
    when /mswin|mingw|cygwin/i
      :windows
    when /darwin/i
      :mac_os
    end

    @command=case command_as_hash_or_string
    when String
      # Work on a copy: the redirection appended below must neither modify the
      # caller's string nor fail when that string is frozen.
      command_as_hash_or_string.dup
    when Hash
      # Allows to write
      #  with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
      #       "some other command"            => :on_windows
      #
      # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
      # on windows, it returns "some other command"
      #
      # If commands for linux & mac os were different :
      #  with "some command"        => :on_linux,
      #       "another command"     => :on_mac_os,
      #       "yet another command" => :on_windows
      #
      #TODO: Make it clearer and more robust.
      # :on_linux_and_mac_os => ["on_linux", "mac_os"] => [:linux, :mac_os]
      platforms_and_command=command_as_hash_or_string.invert.find{|platforms,_command|
        platforms.to_s.split(/_and_/).map{|on_platform| on_platform.sub(/\Aon_/,'').to_sym}.include?(platform)
      }
      # No entry for this platform: fail with an explicit message instead of
      # calling a method on nil.
      platforms_and_command || raise("No command defined for platform #{platform.inspect} by this extractor: #{description}")
      platforms_and_command.last.dup
    else
      block || raise("No command defined for this extractor: #{description}")
    end
    # Silence stderr of simple (non piped) shell commands where /dev/null is available.
    # TODO, replace it with Open3 or something.
    @command<<' 2>/dev/null' if (@command.is_a?(String) && platform.to_s=~/\A(?:linux|mac_os)\z/ && !@command.include?('|'))
  end
end