root/trunk/lib/picolena/templates/lib/plain_text_extractors/ms.word.rb

Revision 216, 1.2 kB (checked in by eric.dumin..@gmail.com, 6 months ago)

Ready for 0.1.4?

  • Property svn:executable set to *
Line 
1 #Word 97-2003
2
# Extractor declaration for the legacy binary Word formats
# (.doc documents and .dot templates, MIME type application/msword).
# NOTE(review): the DSL methods are defined elsewhere; presumably `with`
# maps a shell command to the platforms it runs on and SOURCE is replaced
# by the path of the file being converted -- confirm against PlainTextExtractor.
PlainTextExtractor.new do
  every :doc, :dot
  as "application/msword"
  aka "Microsoft Office Word document"
  # NOTE(review): "some other command" looks like a placeholder for a
  # Windows converter that was never filled in -- confirm before relying on it.
  with "antiword SOURCE" => :on_linux_and_mac_os,
       "some other command" => :on_windows
  # Sample files and the text each one is expected to yield.
  which_should_for_example_extract 'district heating', :from => 'Types of malfunction in DH substations.doc'
  or_extract 'Basic Word template for Picolena specs', :from => 'office2003-word-template.dot'
end
12
13 #Word 2007
14
require 'cgi'
require 'zip/zip'

# Extractor declaration for Word 2007 (Office Open XML) documents and
# templates. The file is opened as a zip archive and the text is pulled
# out of its "word/document.xml" entry without a full XML parser.
PlainTextExtractor.new {
  every :docx, :dotx
  as 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
  aka "Microsoft Office 2007 Word document"
  with {|source|
    Zip::ZipFile.open(source){|zipfile|
      # Splitting on "<" yields one fragment per tag, shaped like
      # "tagname attributes>text up to the next tag".
      fragments = zipfile.read("word/document.xml").split(/</)
      # Keep only <w:t> / <w:t ...> elements. A bare /^w:t/ would also
      # let through w:tbl, w:tr, w:tc, w:tab, ... and add junk fragments.
      text_runs = fragments.grep(/\Aw:t[\s>]/)
      text_runs.collect{|fragment|
        # Drop the rest of the opening tag, then decode XML entities
        # (&amp;, &lt;, ...) so the result is the literal document text.
        CGI.unescapeHTML(fragment.sub(/\A[^>]+>/, ''))
      }.join("\n")
    }
  }
  # Sample files and the text each one is expected to yield.
  which_should_for_example_extract 'Can this office 2007 document be indexed\?', :from => 'office2007-word.docx'
  or_extract 'Basic Word 2007 template for Picolena specs', :from => 'office2007-word-template.dotx'
}
30
31 ## Microsoft Word to text conversion:
32 ##   Program: antiword
33 ##   Version tested: 0.37
34 ##   Installation: Ubuntu antiword package
35 ##   Home page: http://www.winfield.demon.nl/
36
37 ## MS OOXML word to text conversion:
38 ## Ruby code written by Eric DUMINIL
Note: See TracBrowser for help on using the browser.