root/trunk/lib/picolena/templates/lib/plain_text_extractors/ms.excel.rb

Revision 263, 1.3 kB (checked in by eric.dumin..@gmail.com, 6 months ago)

Removing trailing whitespace at the end of lines.

  • Property svn:executable set to *
Line 
1 #Excel 97-2003
2
# Declares the plain-text extractor for legacy Excel workbooks (.xls).
#
# On Linux and Mac OS the text is produced by a shell pipeline:
#   * xls2csv dumps the workbook as CSV (its error output is discarded),
#   * grep keeps only the lines that contain at least one letter,
#   * sed removes double quotes, drops trailing commas and turns the
#     remaining commas into spaces.
#
# NOTE(review): SOURCE is presumably replaced with the path of the file to
# convert by PlainTextExtractor -- confirm against its implementation.
# TODO: "some other command" is only a placeholder for Windows.
PlainTextExtractor.new {
  every :xls
  as "application/excel"
  aka "Microsoft Office Excel document"
  # The grep pattern is quoted: an unquoted [a-z] is subject to pathname
  # expansion and would be rewritten by the shell whenever the working
  # directory contains a file whose name is a single lowercase letter.
  with "xls2csv SOURCE 2>/dev/null | grep -i '[a-z]' | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os,
       "some other command" => :on_windows
  which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls'
}
11
12 #Excel 2007
13
14 require 'zip/zip'
require 'cgi'

# Declares the plain-text extractor for Excel 2007 workbooks (.xlsx).
#
# An .xlsx file is a zip archive of XML parts. The extractor returns one
# string made of the sheet names (from xl/workbook.xml) followed by the
# text of every shared string (from xl/sharedStrings.xml), one per line.
# The XML is scanned with simple string splitting rather than a real
# XML parser.
PlainTextExtractor.new {
  every :xlsx
  as 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
  aka "Microsoft Office 2007 Excel spreadsheet"
  with {|source|
    Zip::ZipFile.open(source){|zipfile|
      # The shared-strings part is optional: a workbook without any string
      # cell does not contain it. Treat it as empty instead of raising, so
      # that the sheet names are still extracted.
      shared_strings_path="xl/sharedStrings.xml"
      shared_strings=zipfile.find_entry(shared_strings_path) ? zipfile.read(shared_strings_path) : ''

      # Every fragment starting with "t" is a <t ...> element; strip the
      # rest of the opening tag and decode XML entities (&amp;, &lt;, ...)
      # so that the literal text is returned.
      text_cells=shared_strings.split(/</).grep(/^t/).collect{|l|
        CGI.unescapeHTML(l.sub(/^[^>]+>/,''))
      }

      # One entry per <sheet .../> element: the decoded value of its name
      # attribute (empty string when the attribute is absent).
      sheet_names=zipfile.read("xl/workbook.xml").split(/</).grep(/^sheet /).collect{|l|
        CGI.unescapeHTML(l[/name="([^"]*)"/,1].to_s)
      }
      (sheet_names+text_cells).join("\n")
    }
  }
  which_should_for_example_extract '<- this result should not be 100000!', :from => 'office2007-excel.xlsx'
  or_extract 'Sheet name should be indexed!!!', :from => 'office2007-excel.xlsx'
}
34
35 ## Microsoft Excel to text conversion:
36 ##   Program: xls2csv
37 ##   Version tested: 0.37
38 ##   Installation: Ubuntu catdoc package
39 ##   Home page: http://www.winfield.demon.nl/
40
41 ## MS OOXML excel to text conversion:
42 ## Ruby code written by Eric DUMINIL
Note: See TracBrowser for help on using the browser.