|
Revision 263, 1.3 kB
(checked in by eric.dumin..@gmail.com, 6 months ago)
|
Removing extra whitespace at end of lines.
|
- Property svn:executable set to
*
|
| Line | |
|---|
| 1 |
#Excel 97-2003 |
|---|
| 2 |
|
|---|
| 3 |
# Declares a plain-text extractor for :xls files, labelled with the
# content type "application/excel" and the description
# "Microsoft Office Excel document".
PlainTextExtractor.new { |
|---|
| 4 |
every :xls |
|---|
| 5 |
as "application/excel" |
|---|
| 6 |
aka "Microsoft Office Excel document" |
|---|
| 7 |
# Platform-keyed shell commands. The Linux/Mac pipeline runs xls2csv with
# stderr discarded, keeps only lines containing a letter (grep -i [a-z]),
# then uses sed to drop double quotes, strip trailing commas and turn the
# remaining commas into spaces.
# NOTE(review): SOURCE is presumably replaced by the input file path before
# the command is run -- confirm against the PlainTextExtractor implementation.
with "xls2csv SOURCE 2>/dev/null | grep -i [a-z] | sed -e 's/\"//g' -e 's/,*$//' -e 's/,/ /g'" => :on_linux_and_mac_os, |
|---|
| 8 |
# NOTE(review): placeholder string -- no real Windows command is given here.
"some other command" => :on_windows |
|---|
| 9 |
# Example expectation: the quoted text should be extracted from 'table.xls'
# (presumably used as a self-check of the extractor -- confirm).
which_should_for_example_extract 'Some text (should be indexed!)', :from => 'table.xls' |
|---|
| 10 |
} |
|---|
| 11 |
|
|---|
| 12 |
#Excel 2007 |
|---|
| 13 |
|
|---|
| 14 |
require 'zip/zip' |
|---|
| 15 |
# Declares a plain-text extractor for :xlsx files. Instead of a shell
# command, it is given a Ruby block that opens the source as a zip archive
# and builds the text from two entries inside it.
PlainTextExtractor.new { |
|---|
| 16 |
every :xlsx |
|---|
| 17 |
as 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' |
|---|
| 18 |
aka "Microsoft Office 2007 Excel spreadsheet" |
|---|
| 19 |
with {|source| |
|---|
| 20 |
Zip::ZipFile.open(source){|zipfile| |
|---|
| 21 |
# Cell text is taken from the xl/sharedStrings.xml entry.
# NOTE(review): split(), grep() and sub(,'') below appear to have lost their
# arguments (probably regex literals stripped when this listing was
# rendered); as shown, `l.sub(,'')` is not valid Ruby. Restore the patterns
# from the repository before relying on this code.
text_cells=zipfile.read("xl/sharedStrings.xml").split().grep().collect{|l| |
|---|
| 22 |
l.sub(,'') |
|---|
| 23 |
} |
|---|
| 24 |
|
|---|
| 25 |
# Sheet names are taken from the xl/workbook.xml entry.
# NOTE(review): the split(), grep() and scan() arguments are missing here as
# well -- see the note on text_cells.
sheet_names=zipfile.read("xl/workbook.xml").split().grep().collect{|l| |
|---|
| 26 |
l.scan() |
|---|
| 27 |
} |
|---|
| 28 |
# Last expression of the block: sheet names first, then cell text, one item
# per line.
(sheet_names+text_cells).join("\n") |
|---|
| 29 |
} |
|---|
| 30 |
} |
|---|
| 31 |
# Example expectations against 'office2007-excel.xlsx': one for cell text,
# one for a sheet name (presumably used as self-checks -- confirm).
which_should_for_example_extract '<- this result should not be 100000!', :from => 'office2007-excel.xlsx' |
|---|
| 32 |
or_extract 'Sheet name should be indexed!!!', :from => 'office2007-excel.xlsx' |
|---|
| 33 |
} |
|---|
| 34 |
|
|---|
| 35 |
## Microsoft Excel to text conversion: |
|---|
| 36 |
## Program: xls2csv |
|---|
| 37 |
## Version tested: 0.37 |
|---|
| 38 |
## Installation: Ubuntu catdoc package |
|---|
| 39 |
## Home page: http://www.winfield.demon.nl/ |
|---|
| 40 |
|
|---|
| 41 |
## MS OOXML Excel to text conversion: |
|---|
| 42 |
## Ruby code written by Eric DUMINIL |
|---|