require 'shellwords'

# Extractor definition for plain text files (source code, LaTeX, README-style
# files without an extension, ...).
#
# The +with+ block receives the path of the file to extract and returns its
# text. Files that File.plain_text? rejects raise "binary file". When
# File.encoding reports a non-empty charset the content is passed through the
# external +iconv+ tool with utf8 as the target encoding; otherwise the file is
# read as-is.
PlainTextExtractor.new {
  every :txt, :text, :tex, :for, :cpp, :c, :rb, :ins, :vee, :java, :no_extension
  as "application/plain"
  aka "plain text file"
  with { |source|
    raise "binary file" unless File.plain_text?(source)

    encoding = File.encoding(source)
    if encoding.empty?
      File.read(source)
    else
      # Both values are shell-escaped: the command line is interpreted by a
      # shell, so a file name containing quotes, `$`, backticks or backslashes
      # would otherwise break the command or inject shell syntax.
      # stderr is discarded and the exit status is not checked, so a failed
      # conversion yields whatever iconv managed to print (possibly nothing).
      escaped_encoding = Shellwords.escape(encoding.to_s)
      escaped_source = Shellwords.escape(source.to_s)
      %x{iconv -f #{escaped_encoding} -t utf8 #{escaped_source} 2>/dev/null}
    end
  }

  # for dependencies spec
  which_requires 'iconv'

  # to check if the extractor is working with basic plain text files
  which_should_for_example_extract 'Hello world!', :from => 'hello.rb'
  or_extract 'text inside!', :from => 'crossed.txt'
  or_extract 'txt inside!', :from => 'crossed.text'
  or_extract 'should index LaTeX too', :from => 'basic.tex'
  or_extract 'public static void main', :from => 'myfirstjavaprog.java'
  or_extract '"void CMatrix::inv()"', :from => 'ccmatrix1.cpp'
  or_extract 'CALL TEST(BOARD,X,Y,POSSIBLE)', :from => 'queens.for'
  or_extract 'This program calculates greatest common divisors', :from => 'gcd.c'
  or_extract 'p 3 square.dat', :from => 'square.ins'
  or_extract 'Do loop (global)', :from => 'xor.vee'

  # to check if other charsets are supported
  # NOTE(review): the three non-ASCII literals below look mis-encoded
  # (mojibake); confirm them against the fixture files before relying on them.
  or_extract 'pÌöÌökÀöÃà AND ÃklÌöÌ', :from => 'utf-8.txt'
  or_extract 'Themenliste fÌr AdsorptionskÀlte', :from => 'iso-8859-1.txt'
  or_extract 'Fâ¬rnwÀrme', :from => 'iso-8859-15.txt'
  or_extract 'The previous line includes some weird chars. Will this file be indexed\?', :from => 'weird_chars.txt'
  or_extract 'Incidentally this file should get indexed as well', :from => 'README'
  # file name containing quote characters, to exercise shell quoting
  or_extract 'OMG_thIs_Is_A_w3ird_fileN4mE!', :from => "'weird'filename.txt"
}