root/trunk/lib/picolena/templates/lib/plain_text_extractors/ms.powerpoint.rb

Revision 216, 1.4 kB (checked in by eric.dumin..@gmail.com, 4 months ago)

Ready for 0.1.4?

  • Property svn:executable set to *
Line 
1 #Powerpoint 97-2003
2
# Extractor declaration for the legacy binary PowerPoint formats (.ppt, .pps).
# On Linux and Mac OS the mapped command string is "catppt SOURCE"; the Windows
# entry is still a placeholder command string.
PlainTextExtractor.new do
  every(:ppt, :pps)
  as('application/powerpoint')
  aka('Microsoft Office Powerpoint document')
  with('catppt SOURCE' => :on_linux_and_mac_os,
       'some other command' => :on_windows)
  # Sample document and the text expected to be found in it.
  which_should_for_example_extract('unofficial written by OOo Impress', :from => 'one_page.ppt')
  # FIXME: catppt does not seem to be able to open .pps files, so the
  # second example below stays disabled.
  #or_extract 'a lightweight ferret-powered search engine written in Ruby on rails.', :from => 'picolena.pps'
end
13
14 #Powerpoint 2007
15
16 require 'zip/zip'
# Extractor declaration for PowerPoint 2007 (.pptx) documents.
# A .pptx file is a zip archive; the block given to `with` receives the path of
# the source file, reads every ppt/slides/slideN.xml entry and returns the
# content of the <a:t> text runs, one per line.
PlainTextExtractor.new {
  every :pptx
  as 'application/vnd.openxmlformats-officedocument.presentationml.presentation' #could that mime BE any longer?
  aka "Microsoft Office 2007 Powerpoint document"
  with {|source|
    # The five entities predefined by XML; decoded in a single pass so that
    # "&amp;lt;" becomes "&lt;" and is not decoded twice.
    xml_entities = {'amp' => '&', 'lt' => '<', 'gt' => '>', 'quot' => '"', 'apos' => "'"}
    Zip::ZipFile.open(source){|zipfile|
      # Anchor the whole name and escape the dot so that only slideN.xml matches.
      slides = zipfile.entries.select{|entry| entry.name =~ /\Appt\/slides\/slide\d+\.xml\z/}
      # Archive order is not guaranteed to be numeric (slide10 could come before
      # slide2), so sort on the slide number found in the file name.
      slides = slides.sort_by{|entry| entry.name[/\d+/].to_i}
      slides.collect{|entry|
        # Match <a:t> exactly (optionally with attributes): a bare /^a:t/ prefix
        # test would also catch a:tbl, a:tr, a:tc, a:tab... and emit blank lines.
        zipfile.read(entry).scan(/<a:t(?:\s[^>]*)?>([^<]*)/).flatten.collect{|text|
          text.gsub(/&(amp|lt|gt|quot|apos);/){ xml_entities[$1] }
        }
      }.flatten.join("\n")
    }
  }
  which_should_for_example_extract 'Welcome to Picolena (one more time!)', :from => 'office2007-powerpoint.pptx'
}
33
34 ## Microsoft Powerpoint to text conversion:
35 ##   Program: catppt
36 ##   Version tested: Catdoc Version 0.94.2
37 ##   Installation: Ubuntu catdoc package
38 ##   Home page: http://www.wagner.pp.ru/~vitus/software/catdoc/
39
40 ## MS OOXML powerpoint to text conversion:
41 ## Ruby code written by Eric DUMINIL
Note: See TracBrowser for help on using the browser.