Changeset 216
- Timestamp:
- 04/23/08 02:13:22 (7 months ago)
- Files:
-
- trunk/History.txt (modified) (1 diff)
- trunk/lib/picolena/templates/lib/plain_text_extractor_DSL.rb (modified) (2 diffs)
- trunk/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb (modified) (1 diff)
- trunk/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb (modified) (1 diff)
- trunk/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb (modified) (1 diff)
- trunk/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb (modified) (1 diff)
- trunk/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb (modified) (1 diff)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/History.txt
r205 r216
  1      - == 0.1.4 2008-04-
  2      -
  3      -
      1  + == 0.1.4 2008-04-23
      2  + * 1 minor enhancement:
      3  + * minimal MacOS support
  4   4
  5   5    == 0.1.3 2008-04-20
trunk/lib/picolena/templates/lib/plain_text_extractor_DSL.rb
r178 r216
 55  55    platform=case RUBY_PLATFORM
 56  56    when /linux/
 57      - :on_linux
     57  + :linux
 58  58    when /win/
 59      - :on_windows
     59  + :windows
     60  + when /darwin/
     61  + :mac_os
 60  62    end
 61  63    @command=case command_as_hash_or_string
  …   …
 63  65    command_as_hash_or_string
 64  66    when Hash
 65      - #dup must be used, otherwise @command gets frozen. No idea why though....
 66      - command_as_hash_or_string.invert[platform].dup
     67  + # Allows to write
     68  + # with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
     69  + # "some other command" => :on_windows
     70  + #
     71  + # On linux and mac_os platforms, it returns "pdftotext -enc UTF-8 SOURCE -",
     72  + # on windows, it returns "some other command"
     73  + #
     74  + # If commands for linux & mac os were different :
     75  + # with "some command" => :on_linux,
     76  + # "another command" => :on_mac_os,
     77  + # "yet another command" => :on_windows
     78  + #
     79  + #TODO: Make it clearer and more robust.
     80  + #NOTE: What to do when no command is defined for a given platform?
     81  + command_as_hash_or_string.invert.find{|platforms,command|
     82  + platforms.to_s.split(/_?and_?/i).collect{|on_platform| on_platform.sub(/on_/,'').to_sym}.include?(platform)
     83  + }.last.dup
 67  84    else
 68  85    block || raise("No command defined for this extractor: #{description}")
 69  86    end
 70      - @command<<' 2>/dev/null' if (@command.is_a?(String) && platform==:on_linux && !@command.include?('|'))
     87  + # TODO, replace it with Open3 or something.
     88  + @command<<' 2>/dev/null' if (@command.is_a?(String) && platform.to_s=~/(linux|mac_os)/ && !@command.include?('|'))
 71  89    end
 72  90    end
trunk/lib/picolena/templates/lib/plain_text_extractors/adobe.pdf.rb
r178 r216
  9   9    as "application/pdf"
 10  10    aka "Adobe Portable Document Format"
 11      - with "pdftotext -enc UTF-8 SOURCE -" => :on_linux, "some other command" => :on_windows
     11  + with "pdftotext -enc UTF-8 SOURCE -" => :on_linux_and_mac_os,
     12  + "some other command" => :on_windows
 12  13    which_should_for_example_extract 'in a pdf file', :from => 'basic.pdf'
 13  14    }
trunk/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb
r178 r216
  5   5    as "application/excel"
  6   6    aka "Microsoft Office Excel document"
  7      - with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux, "some other command" => :on_windows
      7  + with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
      8  + "some other command" => :on_windows
  8   9    which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
  9  10    }
trunk/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb
r178 r216
  5   5    as "application/powerpoint"
  6   6    aka "Microsoft Office Powerpoint document"
  7      - with "catppt SOURCE" => :on_linux, "some other command" => :on_windows
      7  + with "catppt SOURCE" => :on_linux_and_mac_os,
      8  + "some other command" => :on_windows
  8   9    which_should_for_example_extract 'unofficial written by OOo Impress', :from => 'one_page.ppt'
  9  10    #FIXME: it seems that catppt cannot open .pps files.
trunk/lib/picolena/templates/lib/plain_text_extractors/ms.rtf.rb
r178 r216
  9   9    as "application/rtf"
 10  10    aka "Microsoft Rich Text Format"
 11      - with "unrtf SOURCE -t text" => :on_linux, "some other command" => :on_windows
     11  + with "unrtf SOURCE -t text" => :on_linux_and_mac_os,
     12  + "some other command" => :on_windows
 12  13    which_should_for_example_extract 'Resampling when limiting', :from => 'ReadMe.rtf'
 13  14    }
trunk/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb
r178 r216
  5   5    as "application/msword"
  6   6    aka "Microsoft Office Word document"
  7      - with "antiword SOURCE" => :on_linux, "some other command" => :on_windows
      7  + with "antiword SOURCE" => :on_linux_and_mac_os,
      8  + "some other command" => :on_windows
  8   9    which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
  9  10    or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
